From 7a4343df61e6094ae5a2b2eda36c0707d7c1fd2d Mon Sep 17 00:00:00 2001 From: jianyuzh Date: Wed, 27 Dec 2023 11:19:46 +0800 Subject: [PATCH 01/90] first update for migration --- CMakeLists.txt | 76 +- ggml-sycl.cpp | 12393 +++++++++++++++++++++++++++++++++++++++++++++++ ggml-sycl.hpp | 4 + ggml.h | 2 +- 4 files changed, 12461 insertions(+), 14 deletions(-) create mode 100644 ggml-sycl.cpp create mode 100644 ggml-sycl.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a333ff524b65..51089c3b5b742 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. project("llama.cpp" C CXX) +include(CheckIncludeFileCXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -96,11 +97,11 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) -option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -122,7 +123,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) # Compile flags # -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) @@ -338,18 +339,11 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) if (LLAMA_STATIC) - if (WIN32) - # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) - else () - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) - endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) else() set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) - if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # 52 == lowest CUDA 12 standard # 60 == f16 CUDA intrinsics @@ -426,9 +420,6 @@ if (LLAMA_HIPBLAS) if (${hipblas_FOUND} AND ${hip_FOUND}) message(STATUS "HIP and hipBLAS found") add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) - if (LLAMA_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) - endif() add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) if (BUILD_SHARED_LIBS) set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) @@ -454,6 +445,64 @@ if (LLAMA_HIPBLAS) endif() endif() + +if (LLAMA_SYCL) + set(ENABLE_AOT ats) + if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for SYCL") + endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for SYCL") + endif() + + #find_package(SYCL REQUIRED) + find_package(IntelSYCL REQUIRED) + + # Check SYCL support by the compiler + check_cxx_compiler_flag("-fsycl" _fsycl_option) + if (_fsycl_option) + #set (CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} "/opt/intel/oneapi/compiler/2024.0/include") + CHECK_INCLUDE_FILE_CXX("sycl/sycl.hpp" _sycl_header "-fsycl") + set (_sycl_header "/opt/intel/oneapi/compiler/2024.0/include/sycl/sycl.hpp") + if (NOT _sycl_header) + CHECK_INCLUDE_FILE_CXX("CL/sycl.hpp" _sycl_header_old "-fsycl") + endif() + if (_sycl_header OR _sycl_header_old) + set(_sycl_support TRUE) + endif() + endif() + + if (_sycl_support) + add_compile_definitions(GGML_USE_CUBLAS) + #add_compile_options(-std=c++17 -O3 -fsycl) + add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include) + add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl) + add_compile_options(-I/opt/intel/oneapi/dpcpp-ct/2024.0/include) + add_compile_options(-I/opt/intel/oneapi/2024.0/include) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + + set(GGML_HEADERS_SYCL ggml-cuda.h ggml.h ggml-sycl.hpp) + set(GGML_SOURCES_SYCL ggml-sycl.cpp) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_sycl_lapack mkl_sycl_dft mkl_sycl_sparse mkl_sycl_vm mkl_sycl_rng mkl_sycl_stats mkl_sycl_data_fitting mkl_intel_ilp64 mkl_tbb_thread) + + #add_library(ggml-sycl OBJECT ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}) + #add_executable(${PROJECT_NAME} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}) + #target_link_libraries(ggml-sycl PRIVATE sycl) + #target_compile_options(${PROJECT_NAME} PRIVATE ${CMAKE_CXX_FLAGS}) + #set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl) + #add_sycl_to_target({}) + + else() + message(FATAL_ERROR "SYCL Support is not present") + endif() +endif() + + + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -790,6 +839,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} ) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp new file mode 100644 index 0000000000000..160cdf63a502f --- /dev/null +++ b/ggml-sycl.cpp @@ -0,0 +1,12393 @@ +#define DPCT_PROFILING_ENABLED +#define DPCT_COMPAT_RT_VERSION 12010 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap abort +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#else + +#if DPCT_COMPAT_RT_VERSION < 11020 +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 + +#endif // defined(GGML_USE_HIPBLAS) + +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" +#include + +#include + +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_VOLTA 700 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) + +#define GGML_CUDA_MAX_NODES 8192 + +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +//#define GGML_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +// probably other such cases, and not sure what happens on AMD hardware +#if !defined(GGML_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif // defined(GGML_USE_HIPBLAS) + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#if DPCT_COMPAT_RT_VERSION >= 12000 + static const char *cublas_get_error_str(const int err) { + /* + DPCT1009:63: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/ + ; + } +#else + static const char * cublas_get_error_str(const cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: return "unknown error"; + } + } +#endif // CUDART_VERSION >= 12000 + +[[noreturn]] +static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { + fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg); + fprintf(stderr, " in function %s at %s:%d\n", func, file, line); + GGML_ASSERT(!"CUDA error"); +} + +/* +DPCT1001:65: The statement could not be removed. +*/ +/* +DPCT1000:66: Error handling if-stmt was detected but could not be rewritten. +*/ +/* +DPCT1009:67: SYCL uses exceptions to report errors and does not use the error +codes. The original code was commented out and a warning string was inserted. +You need to rewrite this code. +*/ +#define CUDA_CHECK(err) do { \ + auto err_ = (err); if (err_ != 0) ggml_cuda_error( \ + #err, __func__, __FILE__, __LINE__, \ + "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/); \ +} while (0) +#define CUBLAS_CHECK(err) \ + do { auto err_ = (err); if (err_ != 0) \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ + cublas_get_error_str(err_)); } while (0) + +#if !defined(GGML_USE_HIPBLAS) +static const char *cu_get_error_str(int err) { + const char * err_str; + /* + DPCT1007:64: Migration of cuGetErrorString is not supported. + */ + cuGetErrorString(err, &err_str); + return err_str; +} +/* +DPCT1001:82: The statement could not be removed. +*/ +/* +DPCT1000:83: Error handling if-stmt was detected but could not be rewritten. +*/ +#define CU_CHECK(err) \ + do { auto err_ = (err); \ + if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ + cu_get_error_str(err_)); } while (0) +#endif + +#if DPCT_COMPAT_RT_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + +#ifdef GGML_CUDA_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef sycl::float2 dfloat2; +#endif //GGML_CUDA_F16 + +static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8, + const int &i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8, + const int &i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8, + const int &i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +template +using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y, + int k, dpct::queue_ptr stream); +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_cuda_op_mul_mat_t)( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); +typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream); + +// QK = number of values after dequantization +// QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization + +#define QK4_0 32 +#define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) +typedef struct dpct_type_471834 { + sycl::half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +#define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) +typedef struct dpct_type_143705 { + sycl::half2 dm; // dm.x = delta, dm.y = min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +#define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) +typedef struct dpct_type_673649 { + sycl::half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +#define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * QR5_1)) +typedef struct dpct_type_135589 { + sycl::half2 dm; // dm.x = delta, dm.y = min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +#define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) +typedef struct dpct_type_122878 { + sycl::half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct dpct_type_143721 { + sycl::half2 ds; // ds.x = delta, ds.y = sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm, + int **x_qh, int **x_sc); +typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx, + int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, + int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, + const int &i_max, const int &k, + const int &blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms, + const int &i, const int &j, const int &k); + +//================================= k-quants + +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) +typedef struct dpct_type_619598 { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + sycl::half2 dm; // super-block scale for quantized scales/mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) +typedef struct dpct_type_138576 { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + sycl::half d; // super-block scale +} block_q3_K; +//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); + +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) +#ifdef GGML_QKK_64 +typedef struct { + half dm[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct dpct_type_154943 { + sycl::half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif + +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) +#ifdef GGML_QKK_64 +typedef struct { + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct dpct_type_866817 { + sycl::half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) +typedef struct dpct_type_107281 { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales + sycl::half d; // delta +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); + +#define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#define CUDA_GELU_BLOCK_SIZE 256 +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_TANH_BLOCK_SIZE 256 +#define CUDA_RELU_BLOCK_SIZE 256 +#define CUDA_SQR_BLOCK_SIZE 256 +#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_SCALE_BLOCK_SIZE 256 +#define CUDA_CLAMP_BLOCK_SIZE 256 +#define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 +#define CUDA_ALIBI_BLOCK_SIZE 32 +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 +#define CUDA_QUANTIZE_BLOCK_SIZE 256 +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 +#define CUDA_UPSCALE_BLOCK_SIZE 256 +#define CUDA_CONCAT_BLOCK_SIZE 256 +#define CUDA_PAD_BLOCK_SIZE 256 +#define CUDA_ACC_BLOCK_SIZE 256 +#define CUDA_IM2COL_BLOCK_SIZE 256 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE +#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE + +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define MAX_STREAMS 8 +static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { + {&dpct::get_in_order_queue()}}; + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + dpct::event_ptr + events[GGML_CUDA_MAX_DEVICES] + [MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +inline dpct::err0 ggml_cuda_set_device(const int device) try { + int current_device; + CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id()); + + if (device == current_device) { + return 0; + } + + /* + DPCT1093:68: The "device" device may be not the one intended for use. Adjust + the selected device if needed. + */ + return DPCT_CHECK_ERROR(dpct::select_device(device)); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static int g_device_count = -1; +static int g_main_device = 0; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; + +struct cuda_device_capabilities { + int cc; // compute capability + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory +}; + +static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} }; + + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +[[noreturn]] +static void bad_arch(const sycl::stream &stream_ct1) { + stream_ct1 << "ERROR: ggml-cuda was compiled without support for the " + "current GPU architecture.\n"; + __trap(); + + (void) bad_arch; // suppress unused function warning +} + +static __dpct_inline__ float warp_reduce_sum(float x, + const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:0: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + /* + DPCT1096:113: The right-most dimension of the work-group used in the + SYCL kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + } + return x; +} + +static __dpct_inline__ sycl::float2 +warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:1: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), + mask); + /* + DPCT1023:2: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), + mask); + } + return a; +} + +static __dpct_inline__ float warp_reduce_max(float x, + const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:3: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + /* + DPCT1096:112: The right-most dimension of the work-group used in the + SYCL kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x = sycl::fmax(x, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), x, mask)); + } + return x; +} + +static __dpct_inline__ float op_repeat(const float a, const float b) { + return b; +} + +static __dpct_inline__ float op_add(const float a, const float b) { + return a + b; +} + +static __dpct_inline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __dpct_inline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + +static void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +static void gelu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f * xi * + (1.0f + + sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi))); +} + +static void silu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i])); +} + +static void gelu_quick_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_QUICK_COEF = -1.702f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i]))); +} + +static void tanh_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::tanh((float)(x[i])); +} + +static void relu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0); +} + +static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0) + + sycl::fmin((float)(x[i]), 0.0f) * negative_slope; +} + +static void sqr_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] * x[i]; +} + +template +static void norm_f32(const float * x, float * dst, const int ncols, const float eps, + const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + + sycl::float2 mean_var = sycl::float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x() += xi; + mean_var.y() += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + /* + DPCT1118:4: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var, item_ct1); + } + + const float mean = mean_var.x() / ncols; + const float var = mean_var.y() / ncols - mean * mean; + const float inv_std = sycl::rsqrt(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(0) < ne02) { // src0 + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor, + const sycl::nd_item<3> &item_ct1) { + int ne0 = ne00 * scale_factor; + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int i00 = nidx / scale_factor; + int i01 = item_ct1.get_group(1) / scale_factor; + int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02; + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; +} + +static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00 && item_ct1.get_group(1) < ne01 && + item_ct1.get_group(0) < ne02) { + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + +template +static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps, + const sycl::nd_item<3> &item_ct1, float *s_sum) { + int start = item_ct1.get_group(2) * group_size; + int end = start + group_size; + + start += item_ct1.get_local_id(2); + + if (end >= ne_elements) { + end = ne_elements; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += block_size) { + tmp += x[j]; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:5: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:69: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += block_size) { + float xi = x[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:6: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:70: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float variance = tmp / group_size; + float scale = sycl::rsqrt(variance + eps); + for (int j = start; j < end; j += block_size) { + dst[j] *= scale; + } +} + +template +static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps, + const sycl::nd_item<3> &item_ct1, float *s_sum) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:7: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float mean = tmp / ncols; + const float scale = sycl::rsqrt(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = x[ib].dm[1]; + const dfloat m = x[ib].dm[0]; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x() = (v.x() - 16.0f) * d; + v.y() = (v.y() - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = x[ib].dm[1]; + const dfloat m = x[ib].dm[0]; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x() = x[ib].qs[iqs + 0]; + v.y() = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x() *= d; + v.y() *= d; +#endif // GGML_CUDA_F16 +} + +//================================== k-quants + +template +static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = x[i].dm[1]; + float dmin = x[i].dm[0]; + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = item_ct1.get_local_id(2) / 4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = x[i].dm[1]; + const float dmin = x[i].dm[0]; + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = x[i].dm[1]; + const float dmin = x[i].dm[0]; + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template +static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = item_ct1.get_group(2); +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +/* +DPCT1110:8: The total declared local variable size in device function +dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].dm[1]; + const float dmin = x[i].dm[0]; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:9: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:10: The total declared local variable size in device function +dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:11: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:12: The total declared local variable size in device function +dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[1]; + const float dmin = x[i].dm[0]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:13: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:14: The total declared local variable size in device function +dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[1]; + const float dmin = x[i].dm[0]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:15: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:16: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const sycl::half *x = (const sycl::half *)vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, + const sycl::nd_item<3> &item_ct1) { + const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (ix >= kx_padded) { + return; + } + + const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + + const int i_padded = iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = sycl::fabs((float)xi); + float sum = xi; + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:17: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), amax, mask)); + /* + DPCT1023:18: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + sum += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast(y[ib].ds.x()) = d; + reinterpret_cast(y[ib].ds.y()) = sum; +} + +template +static void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2)) * + 2; + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x(); + dst_row[iybs + iqs + y_offset] = v.y(); +} + +template +static void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + 2 * item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x(); + y[iybs + iqs + y_offset] = v.y(); +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4, + const sycl::half2 &ds8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, + const sycl::half2 &ds8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, + const float &d5, const sycl::half2 &ds8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, + const sycl::half2 &dm5, const sycl::half2 &ds8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template +static __dpct_inline__ float +vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0, + const float &d8_1, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template +static __dpct_inline__ float +vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, + const sycl::half2 &ds8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float *__restrict__ d8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float &d8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int &vl, const int &vh, const int *__restrict__ u, + const uint8_t *__restrict__ scales, const int &scale_offset, + const float &d3, const float *__restrict__ d8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d3, + const float &d8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const float *__restrict__ d8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int *__restrict__ vl, const int *__restrict__ vh, + const int *__restrict__ u, const uint8_t *__restrict__ sc, + const uint8_t *__restrict__ m, const sycl::half2 &dm5, + const float *__restrict__ d8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int &vl, const int &vh, const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d, + const float *__restrict__ d8, const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ sc, const float &d6, + const float *__restrict__ d8, + const sycl::stream &stream_ct1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __dpct_inline__ float +vec_dot_q4_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds, + stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs, float *tile_x_d) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs; + *x_dm = (sycl::half2 *)tile_x_d; +} + +template +static __dpct_inline__ void +load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl( + &x_ql[i * (WARP_SIZE + 1) + k], u, + x_dmf[i * (WARP_SIZE / QI4_0) + i / QI4_0 + k / QI4_0], + y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q4_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, + bq8_1->ds, stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs, sycl::half2 *tile_x_dm) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs; + *x_dm = tile_x_dm; +} + +template +static __dpct_inline__ void +load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl( + &x_ql[i * (WARP_SIZE + 1) + k], u, + x_dm[i * (WARP_SIZE / QI4_1) + i / QI4_1 + k / QI4_1], + y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q5_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, + bq8_1->ds, stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, float *tile_x_d) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql; + *x_dm = (sycl::half2 *)tile_x_d; +} + +template +static __dpct_inline__ void +load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = dpct::vectorized_binary( + qs0, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = dpct::vectorized_binary( + qs1, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl( + &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], + y_df[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q5_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, + bq8_1->ds, stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; +} + +template +static __dpct_inline__ void +load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl( + &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], + y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q8_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, + bq8_1->ds[1], stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs, float *tile_x_d) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs; + *x_dm = (sycl::half2 *)tile_x_d; +} + +template +static __dpct_inline__ void +load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; (void)x_sc; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl( + &x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], + x_dmf[i * (WARP_SIZE / QI8_0) + i / QI8_0 + k / QI8_0], + y_df[j * (WARP_SIZE / QI8_1) + k / QI8_1], stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q2_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[1]; + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8, stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq( + v, &y_qs[index_y], scales, + x_dm[i * (WARP_SIZE / QI2_K) + i / QI2_K + kbx], y_df[index_y / QI8_1], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q3_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[1]; + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, + d, d8, stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh, + int *tile_x_sc) { + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = dpct::vectorized_binary( + sc_low | sc_high, 0x20202020, dpct::sub_sat()); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = dpct::vectorized_binary(vll, vlh, dpct::sub_sat()); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq( + v, &y_qs[index_y], scales, + x_dmf[i * (WARP_SIZE / QI3_K) + i / QI3_K + kbx], y_df[index_y / QI8_1], + stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q4_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[1]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8, stream_ct1); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template +static __dpct_inline__ void +allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], + &y_qs[index_y], sc, sc + 8, + x_dm[i * (WARP_SIZE / QI4_K) + i / QI4_K], + &y_ds[index_y / QI8_1], stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q5_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8, + stream_ct1); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template +static __dpct_inline__ void +allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, + sc + 8, + x_dm[i * (WARP_SIZE / QI5_K) + i / QI5_K], + &y_ds[index_y / QI8_1], stream_ct1); +} + +static __dpct_inline__ float +vec_dot_q6_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const sycl::stream &stream_ct1) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + 2 * i].ds[1]; + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8, + stream_ct1); +} + +template +static __dpct_inline__ void +allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2 * WARP_SIZE + 1) + kq0] = + dpct::vectorized_binary(ql0 | qh0, 0x20202020, + dpct::sub_sat()); + x_ql[i * (2 * WARP_SIZE + 1) + kq1] = + dpct::vectorized_binary(ql1 | qh1, 0x20202020, + dpct::sub_sat()); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + (void)x_qh; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq( + &x_ql[index_x], &y_qs[index_y], sc, + x_dmf[i * (WARP_SIZE / QI6_K) + i / QI6_K], &y_df[index_y / QI8_1], + stream_ct1); +} + +template +/* +DPCT1110:19: The total declared local variable size in device function mul_mat_q +exceeds 128 bytes and may cause high register pressure. Consult with your +hardware vendor to find the total register size available and adjust the code, +or use smaller sub-group size to avoid high register pressure. +*/ +static __dpct_inline__ void +mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, + float *__restrict__ dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = item_ct1.get_group(2) * mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = item_ct1.get_group(1) * mmq_x; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, + tile_x_qh, tile_x_sc, item_ct1.get_local_id(1), + nrows_x - row_x_0 - 1, item_ct1.get_local_id(2), + blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2); + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = dpct::min( + (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), + ncols_y - 1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE + + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned( + by0->qs, item_ct1.get_local_id(2) % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = + (ids0 + item_ct1.get_local_id(1) * QI8_1 + + item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) % + mmq_x; + const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1); + const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const sycl::half2 *dsi_src = + &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) + + ir * (WARP_SIZE / QI8_1) + kby] + .ds; + sycl::half2 *dsi_dst = + &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src)[1]; + } + } + + /* + DPCT1118:20: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:71: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i / WARP_SIZE][j / nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i, + item_ct1.get_local_id(1) + j, k); + } + } + } + + /* + DPCT1118:21: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:72: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1); + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::stream &stream_ct1) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + bad_arch(stream_ct1); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +template +static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1, + const sycl::stream &stream_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row * blocks_per_row + i + + item_ct1.get_local_id(2) / (qi / vdr); // x block index + + const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) * + (qk / QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:22: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_CUDA_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:23: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_CUDA_F16 + } +} + +static void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, + const sycl::nd_item<3> &item_ct1) { + + const sycl::half *x = (const sycl::half *)vx; + + const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0); + const int channel_x = channel / (nchannels_y / nchannels_x); + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; + col_x0 += item_ct1.get_local_range(2)) { + const int col_x = col_x0 + item_ct1.get_local_id(2); + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; + const float xi = + sycl::vec{x[ix]} + .convert()[0]; + + const int row_y = col_x; + + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:24: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[idst] = tmp; + } +} + +static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor, + const sycl::nd_item<3> &item_ct1) { + + const sycl::half *x = (const sycl::half *)vx; + + const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0); + const int channel_x = channel / channel_x_divisor; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; + col_x0 += item_ct1.get_local_range(2)) { + const int col_x = col_x0 + item_ct1.get_local_id(2); + + if (col_x >= ncols_x) { + break; + } + + const int row_y = col_x; + + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; + const int iy = channel*nrows_y + row_y; + + const float xi = + sycl::vec{x[ix]} + .convert()[0]; + + tmp += xi * y[iy]; + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1023:25: The SYCL sub-group does not support mask options for + dpct::permute_sub_group_by_xor. You can specify + "--use-experimental-features=masked-sub-group-operation" to use the + experimental helper function to migrate __shfl_xor_sync. + */ + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[idst] = tmp; + } +} + +static void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + sycl::half *dsti = (sycl::half *)cdsti; + + *dsti = sycl::vec{(*xi)} + .convert()[0]; +} + +static void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const sycl::half *xi = (const sycl::half *)cxi; + sycl::half *dsti = (sycl::half *)cdsti; + + *dsti = *xi; +} + +template +static void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = i - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = i - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float)v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j]*id; + + dsti->qs[j] = sycl::round((float)x0); + } +} + +static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float)v)) { + amax = sycl::fabs((float)v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_0/2 + j]*id; + + const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (xi[0 + j] - vmin)*id; + const float x1 = (xi[QK4_1/2 + j] - vmin)*id; + + const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +template +static void cpy_f32_q(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, + const sycl::nd_item<3> &item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)) * + qk; + + if (i >= ne) { + return; + } + + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = (i - i02*ne01*ne00 - i01*ne00); + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); + return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); +} + +struct rope_corr_dims { + float v[4]; +}; + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); + } + *cos_theta = sycl::cos(theta) * mscale; + *sin_theta = sycl::sin(theta) * mscale; +} + +// rope == RoPE == rotary positional embedding +template +static void rope( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +, + const sycl::nd_item<3> &item_ct1) { + const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_neox( + const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims +, + const sycl::nd_item<3> &item_ct1) { + const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int ib = col / n_dims; + const int ic = col % n_dims; + + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; + const int i2 = row/p_delta_rows; + + float cur_rot = inv_ndims * ic - ib; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = + p * freq_scale * dpct::pow(theta_scale, col / 2.0f); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +, const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols); + // FIXME: this is likely wrong + const int p = pos != nullptr ? pos[i2] : 0; + + const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale; + const float sin_theta = sycl::sin((float)theta); + const float cos_theta = sycl::cos((float)theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = + ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale; + const float sin_block_theta = sycl::sin((float)block_theta); + const float cos_block_theta = sycl::cos((float)block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + +static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1, + const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = dpct::pow(m0, k + 1); + } else { + m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static void k_sum_rows_f32(const float * x, float * dst, const int ncols, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(1); + const int col = item_ct1.get_local_id(2); + + float sum = 0.0f; + for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum, item_ct1); + + if (col == 0) { + dst[row] = sum; + } +} + +template +static inline void swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, + const sycl::nd_item<3> &item_ct1) { + // bitonic sort + int col = item_ct1.get_local_id(2); + int row = item_ct1.get_group(1); + + if (col >= ncols) return; + + const float * x_row = x + row * ncols; + int * dst_row = dst + row * ncols; + + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + /* + DPCT1065:73: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } + } + /* + DPCT1118:26: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:74: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + } + } +} + +static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, + const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale, + const sycl::nd_item<3> &item_ct1, float *buf) { + const int tid = item_ct1.get_local_id(2); + const int rowx = item_ct1.get_group(2); + const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + + const int block_size = item_ct1.get_local_range(2); + + const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + + float max_val = -INFINITY; + + for (int col = tid; col < ncols; col += block_size) { + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f)); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val, item_ct1); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf[lane_id] = -INFINITY; + } + /* + DPCT1118:27: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:75: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + if (lane_id == 0) { + buf[warp_id] = max_val; + } + /* + DPCT1118:28: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:76: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + max_val = buf[lane_id]; + max_val = warp_reduce_max(max_val, item_ct1); + } + + float tmp = 0.f; + + for (int col = tid; col < ncols; col += block_size) { + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + const float val = + sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val); + tmp += val; + dst[ix] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf[lane_id] = 0.f; + } + /* + DPCT1118:29: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:77: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + if (lane_id == 0) { + buf[warp_id] = tmp; + } + /* + DPCT1118:30: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:78: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + tmp = buf[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float inv_tmp = 1.f / tmp; + + for (int col = tid; col < ncols; col += block_size) { + const int i = rowx*ncols + col; + dst[i] *= inv_tmp; + } +} + +static void scale_f32(const float * x, float * dst, const float scale, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); +} + +static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta, + int IW, int IH, int OW, int KW, int KH, + int pelements, int CHW, int s0, int s1, int p0, + int p1, int d0, int d1, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (i >= pelements) { + return; + } + + const int ksize = OW * (KH > 1 ? KW : 1); + const int kx = i / ksize; + const int kd = kx * ksize; + const int ky = (i - kd) / OW; + const int ix = i % OW; + + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1; + + const int64_t offset_dst = + (item_ct1.get_group(1) * OW + ix) * CHW + + (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = + sycl::vec{0.0f} + .convert()[0]; + } else { + const int64_t offset_src = item_ct1.get_group(0) * offset_delta; + dst[offset_dst] = + sycl::vec{x[offset_src + iih * IW + iiw]} + .convert()[0]; + } +} + +template +static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const void *src0_dd, + const int32_t *src1_dd, float *dst_dd, + dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + GGML_ASSERT(ne00 % 2 == 0); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows( + src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + + (void) dst; +} + +template +static void get_rows_cuda_float(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const src0_t *src0_dd, const int32_t *src1_dd, + float *dst_dd, dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + } + + (void) dst; +} + +template +struct bin_bcast_cuda { + template + void operator()(const struct ggml_tensor *src0, + const struct ggml_tensor *src1, struct ggml_tensor *dst, + const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, + dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne0[] = {ne0, ne1, ne2, ne3}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb0[] = {nb0, nb1, nb2, nb3}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne0); + collapse(cne1); + } + } + { + int64_t ne0 = cne0[0]; + int64_t ne1 = cne0[1]; + int64_t ne2 = cne0[2]; + int64_t ne3 = cne0[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb0[0]; + size_t nb1 = cnb0[1]; + size_t nb2 = cnb0[2]; + size_t nb3 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = + std::min(ne1, block_size / block_dims[2]); + block_dims[0] = std::min( + std::min(ne2 * ne3, block_size / block_dims[2] / + block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, + s13, item_ct1); + }); + } + } else { + /* + DPCT1049:31: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s11, s12, s13, + item_ct1); + }); + } + } + } +}; + +static void acc_f32_cuda(const float *x, const float *y, float *dst, + const int n_elements, const int ne10, const int ne11, + const int ne12, const int nb1, const int nb2, + const int offset, dpct::queue_ptr stream) { + int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); +} + +static void gelu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_f32(x, dst, k, item_ct1); + }); +} + +static void silu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + silu_f32(x, dst, k, item_ct1); + }); +} + +static void gelu_quick_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_quick_f32(x, dst, k, item_ct1); + }); +} + +static void tanh_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + tanh_f32(x, dst, k, item_ct1); + }); +} + +static void relu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + relu_f32(x, dst, k, item_ct1); + }); +} + +static void leaky_relu_f32_cuda(const float *x, float *dst, const int k, + const float negative_slope, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + leaky_relu_f32(x, dst, k, negative_slope, item_ct1); + }); +} + +static void sqr_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sqr_f32(x, dst, k, item_ct1); + }); +} + +static void norm_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const float eps, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1( + sycl::range<1>(32), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + norm_f32(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:32: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1( + sycl::range<1>(32), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + norm_f32<1024>(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void group_norm_f32_cuda(const float *x, float *dst, + const int num_groups, const int group_size, + const int ne_elements, dpct::queue_ptr stream) { + static const float eps = 1e-6f; + if (group_size < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + const float eps_ct4 = eps; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + group_norm_f32( + x, dst, group_size, ne_elements, eps_ct4, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:33: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + const float eps_ct4 = eps; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + group_norm_f32<1024>(x, dst, group_size, ne_elements, + eps_ct4, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void concat_f32_cuda(const float *x, const float *y, float *dst, + const int ne0, int ne1, int ne2, int ne02, + dpct::queue_ptr stream) { + int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32(x, y, dst, ne0, ne02, item_ct1); + }); +} + +static void upscale_f32_cuda(const float *x, float *dst, const int ne00, + const int ne01, const int ne02, + const int scale_factor, dpct::queue_ptr stream) { + int ne0 = (ne00 * scale_factor); + int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; + sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1); + }); +} + +static void pad_f32_cuda(const float *x, float *dst, const int ne00, + const int ne01, const int ne02, const int ne0, + const int ne1, const int ne2, dpct::queue_ptr stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1); + }); +} + +static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const float eps, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + rms_norm_f32(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx, + const int ky, const int kx_padded, + dpct::queue_ptr stream) { + const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const sycl::range<3> num_blocks(1, ky, block_num_x); + const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(num_blocks * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + quantize_q8_1(x, vy, kx, kx_padded, item_ct1); + }); + } +} + +template +static void dequantize_block_cuda(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block(vx, y, k, item_ct1); + }); + } +} + +template +static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q2_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q3_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, item_ct1); + }); + } +} + +template +static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q5_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q6_K<<>>(vx, y); +#endif +} + +static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F32: + return dequantize_block_cuda<1, 1, convert_f32>; + default: + return nullptr; + } +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F16: + return dequantize_block_cuda<1, 1, convert_f16>; + default: + return nullptr; + } +} + +static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, 32); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} + +static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1, stream_ct1); + }); + }); +} + +static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:35: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:36: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:37: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:38: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:39: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:42: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:43: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:44: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:45: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:46: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + +#if QK_K == 256 + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:47: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:48: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +#endif +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:49: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:50: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:51: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:52: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:53: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } else { + const bool need_check = true; + /* + DPCT1049:54: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::stream stream_ct1(64 * 1024, 80, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, + ncols_y, nrows_y, nrows_dst, + stream_ct1); + }); + }); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y, + float *dst, const int ncols_x, + const int nrows_x, + const int nchannels_x, + const int nchannels_y, + dpct::queue_ptr stream) { + + const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, + nchannels_y, item_ct1); + }); + } +} + +static void ggml_mul_mat_vec_nc_f16_f32_cuda( + const void *vx, const float *y, float *dst, const int ncols_x, + const int nrows_x, const int row_stride_x, const int nchannels_x, + const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) { + + const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, + row_stride_x, channel_stride_x, + nchannels_y / nchannels_x, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void scale_f32_cuda(const float *x, float *dst, const float scale, + const int k, dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + scale_f32(x, dst, scale, k, item_ct1); + }); +} + +static void clamp_f32_cuda(const float *x, float *dst, const float min, + const float max, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + clamp_f32(x, dst, min, max, k, item_ct1); + }); +} + +template +static void rope_cuda(const T *x, T *dst, int ncols, int nrows, + const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, + rope_corr_dims corr_dims, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 2 == 0); + const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nrows); + if (pos == nullptr) { + /* + DPCT1049:55: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope(x, dst, ncols, pos, freq_scale, p_delta_rows, + freq_base, ext_factor, attn_factor, corr_dims, + item_ct1); + }); + } else { + /* + DPCT1049:56: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope(x, dst, ncols, pos, freq_scale, p_delta_rows, + freq_base, ext_factor, attn_factor, corr_dims, + item_ct1); + }); + } +} + +template +static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows, + const int32_t *pos, float freq_scale, + int p_delta_rows, float freq_base, float ext_factor, + float attn_factor, rope_corr_dims corr_dims, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 2 == 0); + const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nrows); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.0f / n_dims; + + if (pos == nullptr) { + /* + DPCT1049:57: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, + item_ct1); + }); + } else { + /* + DPCT1049:58: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, item_ct1); + }); + } +} + +static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows, + const int32_t *pos, float freq_scale, + int p_delta_rows, float freq_base, int n_ctx, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 4 == 0); + const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; + const sycl::range<3> block_nums(1, nrows, num_blocks_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_glm_f32(x, dst, ncols, pos, freq_scale, + p_delta_rows, freq_base, n_ctx, + item_ct1); + }); +} + +static void alibi_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const int k_rows, + const int n_heads_log2_floor, const float m0, + const float m1, dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const sycl::range<3> block_nums(1, nrows, num_blocks_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + alibi_f32(x, dst, ncols, k_rows, + n_heads_log2_floor, m0, m1, item_ct1); + }); +} + +static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const sycl::range<3> block_nums(1, nrows, 1); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + k_sum_rows_f32(x, dst, ncols, item_ct1); + }); +} + +static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols, + const int nrows, ggml_sort_order order, + dpct::queue_ptr stream) { + // bitonic sort requires ncols to be power of 2 + GGML_ASSERT((ncols & (ncols - 1)) == 0); + + const sycl::range<3> block_dims(1, 1, ncols); + const sycl::range<3> block_nums(1, nrows, 1); + if (order == GGML_SORT_ASC) { + /* + DPCT1049:59: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32(x, dst, ncols, item_ct1); + }); + } else if (order == GGML_SORT_DESC) { + /* + DPCT1049:60: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32(x, dst, ncols, item_ct1); + }); + } else { + GGML_ASSERT(false); + } +} + +static void diag_mask_inf_f32_cuda(const float *x, float *dst, + const int ncols_x, const int nrows_x, + const int rows_per_channel, const int n_past, + dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const sycl::range<3> block_nums(1, block_num_x, nrows_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + diag_mask_inf_f32(x, dst, ncols_x, + rows_per_channel, n_past, + item_ct1); + }); +} + +static void soft_max_f32_cuda(const float *x, const float *y, float *dst, + const int ncols_x, const int nrows_x, + const int nrows_y, const float scale, + dpct::queue_ptr stream) { + int nth = WARP_SIZE; + while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; + const sycl::range<3> block_dims(1, 1, nth); + const sycl::range<3> block_nums(1, 1, nrows_x); + /* + DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + /* + DPCT1101:111: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was + replaced with a value. Modify the code to use the original expression, + provided in comments, if it is correct. + */ + sycl::local_accessor buf_acc_ct1( + sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1, + buf_acc_ct1.get_pointer()); + }); + }); +} + +static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH, + int OW, int OH, int KW, int KH, int IC, + int offset_delta, int s0, int s1, int p0, + int p1, int d0, int d1, + dpct::queue_ptr stream) { + const int parallel_elements = OW * KW * KH; + const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; + sycl::range<3> block_nums(IC, OH, num_blocks); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(block_nums * + sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, + parallel_elements, (IC * KH * KW), s0, s1, p0, + p1, d0, d1, item_ct1); + }); + } +} + +// buffer pool for cuda +#define MAX_CUDA_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; + +// #define DEBUG_CUDA_MALLOC +struct cuda_buffer { + void * ptr = nullptr; + size_t size = 0; +}; + +static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; +static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0}; + +static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); +#ifdef DEBUG_CUDA_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CUDA_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + cuda_buffer& b = g_cuda_buffer_pool[id][ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + CUDA_CHECK( + DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device( + look_ahead_size, dpct::get_in_order_queue()))); + *actual_size = look_ahead_size; + g_cuda_pool_size[id] += look_ahead_size; +#ifdef DEBUG_CUDA_MALLOC + fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, + (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); +#endif + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); + g_cuda_pool_size[id] -= size; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +#if !defined(GGML_USE_HIPBLAS) +// pool with virtual memory +/* +DPCT1082:79: Migration of CUmemGenericAllocationHandle type is not supported. +*/ +// static std::vector +// g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES]; +static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; +static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; +static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB + +static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + const size_t alignment = 128; + size = alignment * ((size + alignment - 1) / alignment); + + size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id]; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + const size_t granularity = g_device_caps[id].vmm_granularity; + reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); + + GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + /* + DPCT1082:80: Migration of CUmemAllocationProp type is not supported. + */ + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = id; + /* + DPCT1082:81: Migration of CUmemGenericAllocationHandle type is not + supported. + */ + // CUmemGenericAllocationHandle handle; + /* + DPCT1007:84: Migration of cuMemCreate is not supported. + */ + // CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (g_cuda_pool_addr[id] == 0) { + /* + DPCT1007:85: Migration of cuMemAddressReserve is not supported. + */ + // CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], + // CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + } + + // map at the end of the pool + /* + DPCT1007:86: Migration of cuMemMap is not supported. + */ + // CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], + // reserve_size, 0, handle, 0)); + + // set access + /* + DPCT1082:87: Migration of CUmemAccessDesc type is not supported. + */ + CUmemAccessDesc access = {}; + access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access.location.id = id; + access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + /* + DPCT1007:88: Migration of cuMemSetAccess is not supported. + */ + CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], + reserve_size, &access, 1)); + + // add to the pool + // g_cuda_pool_handles[id].push_back(handle); + g_cuda_pool_size[id] += reserve_size; + + //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024), + // (unsigned long long) (reserve_size/1024/1024)); + } + + GGML_ASSERT(g_cuda_pool_addr[id] != 0); + + void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]); + *actual_size = size; + g_cuda_pool_used[id] += size; + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr); +#endif + + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr); +#endif + + g_cuda_pool_used[id] -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id])); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try { + int id; + + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + if (g_device_caps[id].vmm) { + return ggml_cuda_pool_malloc_vmm(size, actual_size); + } else { + return ggml_cuda_pool_malloc_leg(size, actual_size); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free(void *ptr, size_t size) try { + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + if (g_device_caps[id].vmm) { + ggml_cuda_pool_free_vmm(ptr, size); + } else { + ggml_cuda_pool_free_leg(ptr, size); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} +#else +#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg +#define ggml_cuda_pool_free ggml_cuda_pool_free_leg +#endif // !defined(GGML_USE_HIPBLAS) + +template +struct cuda_pool_alloc { + T * ptr = nullptr; + size_t actual_size = 0; + + // size is in number of elements + T * alloc(size_t size) { + GGML_ASSERT(ptr == nullptr); + ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + cuda_pool_alloc(size_t size) { + alloc(size); + } + + ~cuda_pool_alloc() { + if (ptr != nullptr) { + ggml_cuda_pool_free(ptr, actual_size); + } + } + + T * get() { + return ptr; + } + + cuda_pool_alloc() = default; + cuda_pool_alloc(const cuda_pool_alloc &) = delete; + cuda_pool_alloc(cuda_pool_alloc &&) = delete; + cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete; + cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete; +}; + +static bool g_cublas_loaded = false; + +bool ggml_cublas_loaded(void) { + return g_cublas_loaded; +} + +void print_devices(int device_count){ + for (int id = 0; id < device_count; ++id) { + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(id)))); + + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, + prop.get_name(), prop.get_major_version(), + prop.get_minor_version()); + } +} + +int get_env_value(const char *env_name, int default_val){ + char * user_device_string = getenv(env_name); + int user_device_number = -1; + + unsigned n; + if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) { + user_device_number = (int)n; + } else { + user_device_number=default_val; + } +} +void ggml_init_cublas() try { + static bool initialized = false; + + if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + g_device_count = dpct::dev_mgr::instance().device_count(); + if (DPCT_CHECK_ERROR(g_device_count != 0)) { + initialized = true; + g_cublas_loaded = false; + return; + } + + GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); + int64_t total_vram = 0; +#if defined(GGML_CUDA_FORCE_MMQ) + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); +#endif +#if defined(CUDA_USE_TENSOR_CORES) + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); +#else + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); +#endif + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); + print_devices(g_device_count); + + //zjy hardcode, force set to 1 device + g_device_count = 1; + + for (int id = 0; id < g_device_count; ++id) { + int device_vmm = 0; + +#if !defined(GGML_USE_HIPBLAS) + //int device; + //CU_CHECK(DPCT_CHECK_ERROR(device = id)); + /* + DPCT1028:89: The cuDeviceGetAttribute was not migrated because + parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is + unsupported. + */ + /*CU_CHECK(cuDeviceGetAttribute( + &device_vmm, + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, + device)); + */ + //if (device_vmm) { + /* + DPCT1082:90: Migration of CUmemAllocationProp type is not + supported. + */ + //CUmemAllocationProp alloc_prop = {}; + //alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + //alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + //alloc_prop.location.id = id; + /* + DPCT1007:91: Migration of cuMemGetAllocationGranularity is not + supported. + */ + //CU_CHECK(cuMemGetAllocationGranularity( + // &g_device_caps[id].vmm_granularity, &alloc_prop, + // CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + //} +#endif // !defined(GGML_USE_HIPBLAS) + g_device_caps[id].vmm = !!device_vmm; + + dpct::device_info prop; + dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(id))ï¼› + + // CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + // prop, dpct::dev_mgr::instance().get_device(id)))); + /* + DPCT1005:92: The SYCL device version is different from CUDA Compute + Compatibility. You may need to rewrite this code. + */ + fprintf(stderr, + " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, + prop.get_name(), prop.get_major_version(), + prop.get_minor_version(), device_vmm ? "yes" : "no"); + + g_tensor_split[id] = total_vram; + total_vram += prop.get_global_mem_size(); +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else + /* + DPCT1005:93: The SYCL device version is different from CUDA Compute + Compatibility. You may need to rewrite this code. + */ + g_device_caps[id].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + } + + int user_device_number = get_env_value("GGML_SYCL_DEVICE", 0); + + for (int id = 0; id < g_device_count; ++id) { + g_tensor_split[id] /= total_vram; + } + + for (int id = 0; id < g_device_count; ++id) { + ggml_cuda_set_device(id)ï¼› + // CUDA_CHECK(ggml_cuda_set_device(id)); + + // create cuda streams + for (int is = 0; is < MAX_STREAMS; ++is) { + /* + DPCT1025:105: The SYCL queue is created ignoring the flag and + priority options. + */ + g_cudaStreams[id][is] = + dpct::get_current_device().create_queue()ï¼› + // CUDA_CHECK(DPCT_CHECK_ERROR( + // g_cudaStreams[id][is] = + // dpct::get_current_device().create_queue())); + } + + // create cublas handle + g_cublas_handles[id] = &dpct::get_in_order_queue(); + // CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = + // &dpct::get_in_order_queue())); + /* + DPCT1027:107: The call to cublasSetMathMode was replaced with 0 + because this call is redundant in SYCL. + */ + CUBLAS_CHECK(0); + } + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + + ggml_cuda_set_device(user_device_number); + fprintf(stderr, " set Device %d\n", user_device_number); + + initialized = true; + g_cublas_loaded = true; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_set_tensor_split(const float * tensor_split) { + if (tensor_split == nullptr) { + return; + } + bool all_zero = true; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] != 0.0f) { + all_zero = false; + break; + } + } + if (all_zero) { + return; + } + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] /= split_sum; + } +} + +void *ggml_cuda_host_malloc(size_t size) try { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + dpct::err0 err = DPCT_CHECK_ERROR( + ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())); + /* + DPCT1000:97: Error handling if-stmt was detected but could not be rewritten. + */ + if (err != 0) { + // clear the error + /* + DPCT1026:98: The call to cudaGetLastError was removed because this call + is redundant in SYCL. + */ + /* + DPCT1001:96: The statement could not be removed. + */ + fprintf( + stderr, + "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + /* + DPCT1009:99: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. + */ + size / 1024.0 / 1024.0, + "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/); + return nullptr; + } + + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_host_free(void *ptr) try { + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst, + const struct ggml_tensor *src, + int64_t i3, int64_t i2, + int64_t i1_low, int64_t i1_high, + dpct::queue_ptr stream) try { + + dpct::memcpy_direction kind; + char * src_ptr; + if (src->backend == GGML_BACKEND_CPU) { + kind = dpct::host_to_device; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + kind = dpct::device_to_device; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_ASSERT(false); + } + char * dst_ptr = (char *) dst; + + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t nb1 = src->nb[1]; + const int64_t nb2 = src->nb[2]; + const int64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const int64_t ts = ggml_type_size(type); + const int64_t bs = ggml_blck_size(type); + int64_t i1_diff = i1_high - i1_low; + + const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == ts*ne0/bs) { + return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1)); + } else if (nb0 == ts) { + return DPCT_CHECK_ERROR( + dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1, + ts * ne0 / bs, i1_diff, kind, *stream)); + } else { + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy( + rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream)); + /* + DPCT1001:100: The statement could not be removed. + */ + /* + DPCT1000:101: Error handling if-stmt was detected but could not be + rewritten. + */ + if (r != 0) return r; + } + return 0; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_op_get_rows(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_d, const float *src1_d, + float *dst_d, const dpct::queue_ptr &stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d, + src1_i32, dst_d, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + default: + // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_ASSERT(false); + break; + } +} + +template +inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, + (sycl::half *)dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, + main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + +static void ggml_cuda_op_repeat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_d, const float *src1_d, + float *dst_d, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream); + + (void) src1; + (void) src1_d; +} + +inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); + + (void) dst; +} + +inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_group_norm(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int num_groups = dst->op_params[0]; + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_concat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); + } + + (void) src1; + (void) dst; +} + +inline void ggml_cuda_op_upscale(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + const int scale_factor = dst->op_params[0]; + + upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_cuda(src0_dd, dst_dd, + src0->ne[0], src0->ne[1], src0->ne[2], + dst->ne[0], dst->ne[1], dst->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_mul_mat_q( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + id = dpct::dev_mgr::instance().current_device_id(); + // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static int64_t get_row_rounding(ggml_type type) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int64_t id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_device_caps[id].cc) { + min_compute_capability = g_device_caps[id].cc; + } + if (max_compute_capability < g_device_caps[id].cc) { + max_compute_capability = g_device_caps[id].cc; + } + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +inline void ggml_cuda_op_mul_mat_vec_q( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + GGML_ASSERT(ggml_nrows(src1) == 1); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + cuda_pool_alloc src1_dfloat_a; + half * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_mul_mat_cublas( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + id = dpct::dev_mgr::instance().current_device_id(); + // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + + const int compute_capability = g_device_caps[id].cc; + + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + cuda_pool_alloc src0_as_f16; + if (src0->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16.alloc(ne); + to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream); + } + const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16 + ? (const sycl::half *)src0_dd_i + : src0_as_f16.get(); + + cuda_pool_alloc src1_as_f16; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = src1_ncols*ne10; + src1_as_f16.alloc(ne); + to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); + } + const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 + ? (const sycl::half *)src1_ddf_i + : src1_as_f16.get(); + cuda_pool_alloc dst_f16(row_diff * src1_ncols); + + const sycl::half alpha_f16 = 1.0f; + const sycl::half beta_f16 = 0.0f; + + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm( + g_cublas_handles, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, + src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, + dst_f16.get(), dpct::library_data_t::real_half, ldc, + dpct::library_data_t::real_half))); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); + } + else { + cuda_pool_alloc src0_ddq_as_f32; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32_cuda != nullptr); + src0_ddq_as_f32.alloc(row_diff*ne00); + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); + + const float alpha = 1.0f; + const float beta = 0.0f; + + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); + CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm( + *g_cublas_handles[id], oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha, + src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc))); + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; + } + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_glm) { + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); + } else if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, + ne00, n_dims, nrows, pos, freq_scale, ne01, + freq_base, ext_factor, attn_factor, corr_dims, + main_stream); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, + nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + //GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); + + (void) src1; + (void) src1_dd; +} + +inline void ggml_cuda_op_im2col(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; + const int64_t IW = src1->ne[0]; + + const int64_t KH = is_2D ? src0->ne[1] : 1; + const int64_t KW = src0->ne[0]; + + const int64_t OH = is_2D ? dst->ne[2] : 1; + const int64_t OW = dst->ne[1]; + + const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + + im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, + IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + + (void) src0; + (void) src0_dd; +} + +inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_argsort(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_soft_max(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1; + + float scale = 1.0f; + memcpy(&scale, dst->op_params, sizeof(float)); + + soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); + + (void) dst; +} + +inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); + /* + DPCT1010:102: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); + /* + DPCT1010:103: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_cuda_op_flatten(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const ggml_cuda_op_flatten_t op) try { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + cuda_pool_alloc src0_f; + cuda_pool_alloc src1_f; + cuda_pool_alloc dst_f; + + ggml_cuda_set_device(g_main_device); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = src0_f.alloc(ggml_nelements(src0)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = src1_f.alloc(ggml_nelements(src1)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = dst_f.alloc(ggml_nelements(dst)); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + /* + DPCT1010:104: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(DPCT_CHECK_ERROR( + main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_current_device().queues_wait_and_throw())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + for (int id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } +#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + +static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + ggml_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) try { + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nrows0 = ggml_nrows(src0); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_nrows(src1); + + GGML_ASSERT(ne03 == ne13); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + + GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); + + const int64_t i02_divisor = ne12 / ne02; + + const size_t src0_ts = ggml_type_size(src0->type); + const size_t src0_bs = ggml_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_is_contiguous(src0); + const bool src1_is_contiguous = ggml_is_contiguous(src1); + + const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 > 1)); + GGML_ASSERT(!(split && ne03 > 1)); + GGML_ASSERT(!(split && ne02 < ne12)); + + // dd = data device + char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1 + float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + // as = actual size + size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0}; + + int64_t row_low[GGML_CUDA_MAX_DEVICES]; + int64_t row_high[GGML_CUDA_MAX_DEVICES]; + + int used_devices = 0; + + for (int64_t id = 0; id < g_device_count; ++id) { + // by default, use all rows + row_low[id] = 0; + row_high[id] = ne01; + + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes + if (split) { + const int64_t rounding = get_row_rounding(src0->type); + + if (id != 0) { + row_low[id] = ne01*g_tensor_split[id]; + if (row_low[id] < ne01) { + row_low[id] -= row_low[id] % rounding; + } + } + + if (id != g_device_count - 1) { + row_high[id] = ne01*g_tensor_split[id + 1]; + if (row_high[id] < ne01) { + row_high[id] -= row_high[id] % rounding; + } + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + used_devices++; + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + + ggml_cuda_set_device(id); + const dpct::queue_ptr stream = g_cudaStreams[id][0]; + + if (src0_on_device && src0_is_contiguous) { + src0_dd[id] = (char *) src0_extra->data_device[id]; + } else { + // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + } + + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + } + + if (convert_src1_to_q8_1) { + src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + + if (src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + /* + DPCT1010:105: SYCL uses exceptions to report errors and does not + use the error codes. The call was replaced with 0. You need to + rewrite this code. + */ + CUDA_CHECK(0); + } + } + + if (dst_on_device) { + dst_dd[id] = (float *) dst_extra->data_device[id]; + } else { + const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); + dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); + } + } + + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && used_devices > 1) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + /* + DPCT1024:106: The original code returned the error code that was further + consumed by the program logic. This original code was replaced with 0. + You may need to rewrite the program logic consuming the error code. + */ + CUDA_CHECK(DPCT_CHECK_ERROR( + *src0_extra->events[g_main_device][0] = + g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier())); + } + + const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = row_high[id] - row_low[id]; + + ggml_cuda_set_device(id); + const dpct::queue_ptr stream = g_cudaStreams[id][is]; + + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier( + {*src0_extra->events[g_main_device][0]}))); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; + float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + dst_dd_i += row_low[id]; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset; + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + src1_ddq_i, src1_ddq_i_source, + src1_ncols * src1_padded_col_size * q8_1_ts / + q8_1_bs))); + } else { + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + src1_ddf_i, src1_ddf_i_source, + src1_ncols * ne10 * sizeof(float)))); + } + } + } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_ASSERT(false); + } + + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + /* + DPCT1010:107: SYCL uses exceptions to report errors and does + not use the error codes. The call was replaced with 0. You + need to rewrite this code. + */ + CUDA_CHECK(0); + } + + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); + } + + // do the computation + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); + /* + DPCT1010:108: SYCL uses exceptions to report errors and does not + use the error codes. The call was replaced with 0. You need to + rewrite this code. + */ + CUDA_CHECK(0); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + dpct::memcpy_direction kind; + if (dst->backend == GGML_BACKEND_CPU) { + dst_off_device = dst->data; + kind = dpct::device_to_host; + } else if (dst->backend == GGML_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = dpct::device_to_device; + } else { + GGML_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + row_low[id]; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy( + dhf_dst_i, ne0 * sizeof(float), dst_dd_i, + row_diff * sizeof(float), row_diff * sizeof(float), + src1_ncols, kind, *stream))); + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(dhf_dst_i, dst_dd_i, + src1_ncols * ne0 * sizeof(float)))); + } + } + + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + /* + DPCT1024:109: The original code returned the error code that + was further consumed by the program logic. This original + code was replaced with 0. You may need to rewrite the + program logic consuming the error code. + */ + CUDA_CHECK(DPCT_CHECK_ERROR( + *src0_extra->events[id][is] = + stream->ext_oneapi_submit_barrier())); + } + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + CUDA_CHECK(ggml_cuda_set_device(id)); + + // free buffers again when done + if (dst_as[id] > 0) { + ggml_cuda_pool_free(dst_dd[id], dst_as[id]); + } + if (src1_asq[id] > 0) { + ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); + } + if (src0_as[id] > 0) { + ggml_cuda_pool_free(src0_dd[id], src0_as[id]); + } + } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + for (int64_t id = 0; id < g_device_count; ++id) { + if (row_low[id] == row_high[id]) { + continue; + } + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(DPCT_CHECK_ERROR( + g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier( + {*src0_extra->events[id][is]}))); + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_current_device().queues_wait_and_throw())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + +static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); +} + +static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc); +} + +static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); +} + +static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div); +} + +static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); +} + +static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); +} + +static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick); +} + +static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh); +} + +static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu); +} + +static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu); +} + +static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr); +} + +static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); +} + +static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm); +} + +static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat); +} + +static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale); +} + +static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); +} + +static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (!g_cublas_loaded) return false; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); +} + +static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int64_t row_stride_x = nb01 / sizeof(sycl::half); + const int64_t channel_stride_x = nb02 / sizeof(sycl::half); + + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void k_compute_batched_ptrs(const sycl::half *src0_as_f16, + const sycl::half *src1_as_f16, char *dst, + const void **ptrs_src, void **ptrs_dst, + int64_t ne12, int64_t ne13, int64_t ne23, + size_t nb02, size_t nb03, size_t nb12, + size_t nb13, size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3, + const sycl::nd_item<3> &item_ct1) { + int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; +} + +static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK( + DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream)); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + sycl::half *src0_as_f16 = (sycl::half *)src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + cuda_pool_alloc src1_as_f16(ne1); + to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream); + + cuda_pool_alloc dst_f16; + char * dst_t; + + dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F; + dpct::library_data_t cu_data_type = dpct::library_data_t::real_half; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const sycl::half alpha_f16 = 1.0f; + const sycl::half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + dst_t = (char *) dst_f16.alloc(ne); + + nbd2 /= sizeof(float) / sizeof(sycl::half); + nbd3 /= sizeof(float) / sizeof(sycl::half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = dpct::library_data_t::real_float; + + alpha = &alpha_f32; + beta = &beta_f32; + } + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( + g_cublas_handles, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + (const char *)src0_as_f16, dpct::library_data_t::real_half, + nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half), + (const char *)src1_as_f16.get(), dpct::library_data_t::real_half, + nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta, + (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float), + ne12 * ne13, cu_compute_type))); + } else { + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + cuda_pool_alloc ptrs_src(2*ne23); + cuda_pool_alloc< void *> ptrs_dst(1*ne23); + + sycl::range<3> block_dims(1, ne12, ne13); + /* + DPCT1049:62: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(main_stream->get_device(), + {sycl::aspect::fp16}); + main_stream->submit([&](sycl::handler &cgh) { + const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get(); + const void **ptrs_src_get_ct3 = ptrs_src.get(); + void **ptrs_dst_get_ct4 = ptrs_dst.get(); + + cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_compute_batched_ptrs( + src0_as_f16, src1_as_f16_get_ct1, + dst_t, ptrs_src_get_ct3, + ptrs_dst_get_ct4, ne12, ne13, ne23, + nb02, nb03, nb12, nb13, nbd2, nbd3, r2, + r3, item_ct1); + }); + }); + } + /* + DPCT1010:110: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this + code. + */ + CUDA_CHECK(0); + + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( + g_cublas_handles, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + (const void **)(ptrs_src.get() + 0 * ne23), + dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), + (const void **)(ptrs_src.get() + 1 * ne23), + dpct::library_data_t::real_half, nb11 / sizeof(float), beta, + (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, + cu_compute_type))); + } +#endif + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const bool all_on_device = + (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_GPU) && + ( dst->backend == GGML_BACKEND_GPU); + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + + int64_t min_compute_capability = INT_MAX; + for (int64_t id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } + } + +#ifdef CUDA_USE_TENSOR_CORES + const bool use_tensor_cores = true; +#else + const bool use_tensor_cores = false; +#endif + + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + + if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // KQ single-batch + ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + // KQV single-batch + ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + // KQ + KQV multi-batch + ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); + } else if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; +#endif // GGML_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_nrows(src1) > 1 + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } + } else { + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { + use_mul_mat_q = false; + } + + if (use_mul_mat_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } + } + } else { + GGML_ASSERT(false); + } +} + +#if 0 +template +static __global__ void k_compute_batched_ptrs_id( + const void ** ptrs_src, void ** ptrs_dst, + int ne12, int ne13, + int ne23, + int nb02, int nb03, + int nb12, int nb13, + int nb2, int nb3, + int r2, int r3, + ggml_type src0_type, half * src0_as_f16, int64_t src0_ne, + const half * src1_f16, half * dst_f16, + const int32_t * ids, const int id, + Srcs... src0s) { + + int i = ids[id]; + + half * src0_f16; + const void * srcs_ar[] = { (const half *) src0s... }; + if (src0_type == GGML_TYPE_F16) { + src0_f16 = (half *) srcs_ar[i]; + } else { + src0_f16 = src0_as_f16; + if (threadIdx.x == 0 && threadIdx.y == 0) { + const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type); + to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget); + } + } + + int i13 = blockIdx.x * blockDim.x + threadIdx.x; + int i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; +} + +static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { + const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src00 = dst->src[2]; + + const int id = dst->op_params[0]; + + GGML_ASSERT(!ggml_is_transposed(src00)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src00->ne[1]; + const int64_t ne02 = src00->ne[2]; + const int64_t ne03 = src00->ne[3]; + + //const int64_t nb01 = src00->nb[1]; + const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); + + //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + //void * src0_ddq = src0_extra->data_device[g_main_device]; + //half * src0_as_f16 = (half *) src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + const void ** ptrs_src = nullptr; + void ** ptrs_dst = nullptr; + + size_t ptrs_src_s = 0; + size_t ptrs_dst_s = 0; + + ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s); + ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s); + + int64_t src0_ne = ggml_nelements(src00); + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src00->type != GGML_TYPE_F16) { + src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as); + } + + static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6"); + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>( + ptrs_src, ptrs_dst, + ne12, ne13, + ne23, + ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half), + nb12, nb13, + dst->nb[2], dst->nb[3], + r2, r3, + src00->type, src0_as_f16, src0_ne, + src1_as_f16, dst_f16, + (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id, + dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr, + dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr, + dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr, + dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr + ); + CUDA_CHECK(cudaGetLastError()); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00, + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10, + &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_as_f16, src0_as); + } + if (ptrs_src_s != 0) { + ggml_cuda_pool_free(ptrs_src, ptrs_src_s); + } + if (ptrs_dst_s != 0) { + ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s); + } + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(src1_as_f16, src1_as); + ggml_cuda_pool_free(dst_f16, dst_as); +} +#endif + +static void ggml_cuda_mul_mat_id(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { +#if 0 + ggml_cuda_mul_mat_id_cublas(dst); + // TODO: mmq/mmv support +#endif + + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; + + const struct ggml_tensor * ids = src0; + const int32_t id = ((int32_t *) dst->op_params)[0]; + const int32_t n_as = ((int32_t *) dst->op_params)[1]; + + std::vector ids_host(ggml_nbytes(ids)); + + const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0]; + + if (ids->backend == GGML_BACKEND_GPU) { + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); + CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait())); + } else { + memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); + } + + const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; + + ggml_tensor_extra_gpu src1_row_extra; + ggml_tensor_extra_gpu dst_row_extra; + + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + src1_row.backend = GGML_BACKEND_GPU; + dst_row.backend = GGML_BACKEND_GPU; + + src1_row.extra = &src1_row_extra; + dst_row.extra = &dst_row_extra; + + char * src1_original = src1->backend == GGML_BACKEND_CPU ? + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; + char * dst_original = dst->backend == GGML_BACKEND_CPU ? + (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + + if (src1->ne[1] == 1) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? + + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + cuda_pool_alloc src1_contiguous(sizeof(float)*ggml_nelements(src1)); + cuda_pool_alloc dst_contiguous(sizeof(float)*ggml_nelements(dst)); + + src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); + dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); + + const dpct::memcpy_direction src1_kind = + src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device + : dpct::device_to_device; + const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU + ? dpct::device_to_host + : dpct::device_to_device; + + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11, + src1_original + i01 * nb11, nb11))); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + dst_original + i01 * nb1, + dst_contiguous.get() + num_src1_rows * nb1, nb1))); + num_src1_rows++; + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); +} + +static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp); +} + +static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) try { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } + + (void) dst; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + +static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); +} + +static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); +} + +static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); +} + +static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); +} + +static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); +} + +static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows); +} + +static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort); +} + +static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; +} + +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + +void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try { + const int64_t nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + + const size_t nb1 = tensor->nb[1]; + + ggml_backend_type backend = tensor->backend; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + + for (int64_t id = 0; id < g_device_count; ++id) { + if (backend == GGML_BACKEND_GPU && id != g_main_device) { + continue; + } + + ggml_cuda_set_device(id); + + int64_t row_low, row_high; + if (backend == GGML_BACKEND_GPU) { + row_low = 0; + row_high = nrows; + } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows; + } else { + row_high = nrows*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + GGML_ASSERT(false); + } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf; + CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + char * buf_host = (char *)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue() + .memset(buf + original_size, 0, size - original_size) + .wait())); + } + + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy(buf, buf_host, original_size) + .wait())); + + extra->data_device[id] = buf; + + if (backend == GGML_BACKEND_GPU_SPLIT) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] = + new sycl::event())); + } + } + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_free_data(struct ggml_tensor *tensor) try { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + return; + } + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + for (int64_t id = 0; id < g_device_count; ++id) { + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free( + extra->data_device[id], dpct::get_in_order_queue()))); + } + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::destroy_event(extra->events[id][is]))); + } + } + } + + delete extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + +static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor, + bool scratch, bool force_inplace, + bool no_alloc) try { + if (scratch && g_scratch_size == 0) { + return; + } + + tensor->backend = GGML_BACKEND_GPU; + + // recursively assign CUDA buffers until a compute tensor is found + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); + } + } + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); + } + + if (scratch && no_alloc) { + return; + } + + ggml_tensor_extra_gpu * extra; + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW || + force_inplace; + const size_t size = ggml_nbytes(tensor); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&offset, tensor->op_params, sizeof(size_t)); + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_OP_CPY) { + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(DPCT_CHECK_ERROR( + data = (char *)sycl::malloc_device( + g_scratch_size, dpct::get_in_order_queue()))); + g_scratch_buffer = data; + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue().memset(data, 0, size).wait())); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + extra->data_device[g_main_device] = data; + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor, + size_t offset) try { + if (g_scratch_size == 0) { + return; + } + if (g_scratch_buffer == nullptr) { + ggml_cuda_set_device(g_main_device); + CUDA_CHECK( + DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device( + g_scratch_size, dpct::get_in_order_queue()))); + } + + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + + const bool inplace = tensor->view_src != nullptr; + + if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_contiguous(tensor)); + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy(extra->data_device[g_main_device], + tensor->data, ggml_nbytes(tensor)) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, true); +} + +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, false, false); +} + +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, true, false); +} + +void ggml_cuda_set_main_device(const int main_device) try { + if (main_device >= g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + + if (g_main_device != main_device && g_device_count > 1) { + g_main_device = main_device; + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(g_main_device)))); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, + g_main_device, prop.get_name()); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_set_scratch_size(const size_t scratch_size) { + // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously + // it still won't always work as expected, but it's better than nothing + if (scratch_size > g_scratch_size) { + ggml_cuda_free_scratch(); + } + g_scratch_size = std::max(g_scratch_size, scratch_size); +} + +void ggml_cuda_free_scratch() try { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(DPCT_CHECK_ERROR( + sycl::free(g_scratch_buffer, dpct::get_in_order_queue()))); + g_scratch_buffer = nullptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + if (!g_cublas_loaded) return false; + + ggml_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_BACKEND_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { + return false; + } + + if (tensor->op == GGML_OP_MUL_MAT) { + if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); +#endif + return false; + } + } + + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; + case GGML_OP_DUP: + func = ggml_cuda_dup; + break; + case GGML_OP_ADD: + func = ggml_cuda_add; + break; + case GGML_OP_ACC: + func = ggml_cuda_acc; + break; + case GGML_OP_MUL: + func = ggml_cuda_mul; + break; + case GGML_OP_DIV: + func = ggml_cuda_div; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_cuda_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_cuda_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_cuda_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_cuda_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_cuda_relu; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_cuda_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_cuda_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_cuda_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_cuda_upscale; + break; + case GGML_OP_PAD: + func = ggml_cuda_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_cuda_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_cuda_rms_norm; + break; + case GGML_OP_MUL_MAT: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat; + break; + case GGML_OP_MUL_MAT_ID: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_cuda_scale; + break; + case GGML_OP_SQR: + func = ggml_cuda_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_cuda_clamp; + break; + case GGML_OP_CPY: + func = ggml_cuda_cpy; + break; + case GGML_OP_CONT: + func = ggml_cuda_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_cuda_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_cuda_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_cuda_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_cuda_rope; + break; + case GGML_OP_ALIBI: + func = ggml_cuda_alibi; + break; + case GGML_OP_IM2COL: + func = ggml_cuda_im2col; + break; + case GGML_OP_SUM_ROWS: + func = ggml_cuda_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_cuda_argsort; + break; + default: + return false; + } + + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return true; + } + func(tensor->src[0], tensor->src[1], tensor); + return true; +} + +int ggml_cuda_get_device_count() try { + int device_count; + if (DPCT_CHECK_ERROR(device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + return 0; + } + return device_count; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_get_device_description(int device, char *description, + size_t description_size) try { + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device)))); + snprintf(description, description_size, "%s", prop.get_name()); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED GGML_UNUSED + +// cuda buffer + +struct ggml_backend_buffer_context_cuda { + int device; + void * dev_ptr = nullptr; + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + + ~ggml_backend_buffer_context_cuda() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void +ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + CUDA_CHECK( + DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue()))); + delete ctx; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->dev_ptr; +} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[ctx->device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset( + (char *)tensor->data + original_size, 0, + padded_size - original_size))); + } + } + + UNUSED(buffer); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor, + const void *data, size_t offset, + size_t size) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy((char *)tensor->data + offset, data, size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *tensor, + void *data, size_t offset, + size_t size) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue() + .memcpy(data, (const char *)tensor->data + offset, size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, + uint8_t value) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memset(ctx->dev_ptr, value, buffer->size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_buffer_clear, +}; + +// cuda buffer type + +static ggml_backend_buffer_t +ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) try { + int device = (int) (intptr_t) buft->context; + + ggml_cuda_set_device(device); + + size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 + + void * dev_ptr; + CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + + ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + + return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + UNUSED(buft); +} + +static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ nullptr, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { + ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, + /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + }; + } + ggml_backend_cuda_buffer_type_initialized = true; + } + + return &ggml_backend_cuda_buffer_types[device]; +} + +// host buffer type + +static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cuda_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cuda_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + // FIXME: this is a hack to avoid having to implement a new buffer type + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cuda_buffer_type_host; +} + +// backend + +struct ggml_backend_context_cuda { + int device; +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + delete cuda_ctx; + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + return ggml_backend_cuda_buffer_type(cuda_ctx->device); +} + +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, + ggml_tensor *tensor, + const void *data, size_t offset, + size_t size) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy( + (char *)tensor->data + offset, data, size))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, + const ggml_tensor *tensor, + void *data, size_t offset, + size_t size) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy( + data, (const char *)tensor->data + offset, size))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait())); + + UNUSED(backend); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { + GGML_ASSERT(!"not implemented"); + + return nullptr; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + ggml_cuda_set_main_device(cuda_ctx->device); + + ggml_compute_params params = {}; + params.type = GGML_TASK_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + continue; + + assert(node->backend == GGML_BACKEND_GPU); + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->extra != nullptr); + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->extra != nullptr); + } + } + + bool ok = ggml_cuda_compute_forward(¶ms, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + +#if 0 + if (node->type == GGML_TYPE_F32) { + cudaDeviceSynchronize(); + std::vector tmp(ggml_nelements(node), 0.0f); + cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); + printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), + ggml_type_name(node->src[0]->type), + node->src[1] ? ggml_type_name(node->src[1]->type) : "none", + node->src[0]->name, + node->src[1] ? node->src[1]->name : "none"); + double sum = 0.0; + double sq_sum = 0.0; + for (int i = 0; i < ggml_nelements(node); i++) { + printf("%f ", tmp[i]); + sum += tmp[i]; + sq_sum += tmp[i]*tmp[i]; + } + printf("\n"); + printf("sum: %f, ", sum); + printf("sq_sum: %f\n", sq_sum); + } +#endif + } + + UNUSED(backend); +} + +static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + break; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + { + struct ggml_tensor * a; + struct ggml_tensor * b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + return true; + } break; + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + return false; + } break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_REPEAT: + case GGML_OP_DUP: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ALIBI: + case GGML_OP_IM2COL: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_CONCAT: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + UNUSED(backend); +} + +static ggml_backend_i cuda_backend_i = { + /* .get_name = */ ggml_backend_cuda_name, + /* .free = */ ggml_backend_cuda_free, + /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ ggml_backend_cuda_supports_op, +}; + +ggml_backend_t ggml_backend_cuda_init(int device) { + ggml_init_cublas(); // TODO: remove from ggml.c + + if (device < 0 || device >= ggml_cuda_get_device_count()) { + fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + // not strictly necessary, but it may reduce the overhead of the first graph_compute + ggml_cuda_set_main_device(device); + + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { + /* .device = */ device + }; + + ggml_backend_t cuda_backend = new ggml_backend { + /* .interface = */ cuda_backend_i, + /* .context = */ ctx + }; + + return cuda_backend; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return backend->iface.get_name == ggml_backend_cuda_name; +} + +static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { + ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); + return cuda_backend; + + UNUSED(params); +} + +extern "C" int ggml_backend_cuda_reg_devices(); + +int ggml_backend_cuda_reg_devices() { + int device_count = ggml_cuda_get_device_count(); + //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization + for (int i = 0; i < device_count; i++) { + char name[128]; + snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i); + ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i); + } + return device_count; +} diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp new file mode 100644 index 0000000000000..40710da2e8bc8 --- /dev/null +++ b/ggml-sycl.hpp @@ -0,0 +1,4 @@ +#include +#include +typedef half ggml_fp16_t; + diff --git a/ggml.h b/ggml.h index dca7bd9ceb0d5..533f40c9f8f16 100644 --- a/ggml.h +++ b/ggml.h @@ -2283,7 +2283,7 @@ extern "C" { typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - typedef struct { + typedef struct dpct_type_994041 { const char * type_name; int blck_size; size_t type_size; From 233876936b9f9671e57b6f5848d6ce9055caea08 Mon Sep 17 00:00:00 2001 From: jianyuzh Date: Thu, 28 Dec 2023 16:40:42 +0800 Subject: [PATCH 02/90] update init_cublas --- ggml-sycl.cpp | 2364 ++++++++++++++++++++++++++++++------------------- 1 file changed, 1456 insertions(+), 908 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 160cdf63a502f..e74902c98d5ce 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -117,7 +117,7 @@ #include -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define MIN_CC_DP4A 510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 #define CC_OFFSET_AMD 1000000 #define CC_RDNA2 (CC_OFFSET_AMD + 1030) @@ -217,7 +217,7 @@ static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); #if DPCT_COMPAT_RT_VERSION >= 12000 static const char *cublas_get_error_str(const int err) { /* - DPCT1009:63: SYCL uses exceptions to report errors and does not use the + DPCT1009:57: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ @@ -249,13 +249,13 @@ static void ggml_cuda_error(const char * stmt, const char * func, const char * f } /* -DPCT1001:65: The statement could not be removed. +DPCT1001:59: The statement could not be removed. */ /* -DPCT1000:66: Error handling if-stmt was detected but could not be rewritten. +DPCT1000:60: Error handling if-stmt was detected but could not be rewritten. */ /* -DPCT1009:67: SYCL uses exceptions to report errors and does not use the error +DPCT1009:61: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ @@ -273,16 +273,16 @@ You need to rewrite this code. static const char *cu_get_error_str(int err) { const char * err_str; /* - DPCT1007:64: Migration of cuGetErrorString is not supported. + DPCT1007:58: Migration of cuGetErrorString is not supported. */ cuGetErrorString(err, &err_str); return err_str; } /* -DPCT1001:82: The statement could not be removed. +DPCT1001:76: The statement could not be removed. */ /* -DPCT1000:83: Error handling if-stmt was detected but could not be rewritten. +DPCT1000:77: Error handling if-stmt was detected but could not be rewritten. */ #define CU_CHECK(err) \ do { auto err_ = (err); \ @@ -573,14 +573,15 @@ struct ggml_tensor_extra_gpu { // probably because the Windows CUDA libraries forget to make this check before invoking the drivers inline dpct::err0 ggml_cuda_set_device(const int device) try { int current_device; - CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK(DPCT_CHECK_ERROR( + current_device = dpct::dev_mgr::instance().current_device_id())); if (device == current_device) { return 0; } /* - DPCT1093:68: The "device" device may be not the one intended for use. Adjust + DPCT1093:62: The "device" device may be not the one intended for use. Adjust the selected device if needed. */ return DPCT_CHECK_ERROR(dpct::select_device(device)); @@ -624,13 +625,7 @@ static __dpct_inline__ float warp_reduce_sum(float x, #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { /* - DPCT1023:0: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ - /* - DPCT1096:113: The right-most dimension of the work-group used in the + DPCT1096:107: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than "32". The function "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value @@ -645,20 +640,8 @@ static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:1: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), mask); - /* - DPCT1023:2: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), mask); } @@ -670,13 +653,7 @@ static __dpct_inline__ float warp_reduce_max(float x, #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { /* - DPCT1023:3: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ - /* - DPCT1096:112: The right-most dimension of the work-group used in the + DPCT1096:106: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than "32". The function "dpct::permute_sub_group_by_xor" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value @@ -907,7 +884,7 @@ static void norm_f32(const float * x, float * dst, const int ncols, const float s_sum[warp_id] = mean_var; } /* - DPCT1118:4: SYCL group functions and algorithms must be encountered in + DPCT1118:0: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1012,11 +989,11 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c s_sum[warp_id] = tmp; } /* - DPCT1118:5: SYCL group functions and algorithms must be encountered in + DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:69: Consider replacing sycl::nd_item::barrier() with + DPCT1065:63: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -1043,11 +1020,11 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c s_sum[warp_id] = tmp; } /* - DPCT1118:6: SYCL group functions and algorithms must be encountered in + DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:70: Consider replacing sycl::nd_item::barrier() with + DPCT1065:64: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -1087,7 +1064,7 @@ static void rms_norm_f32(const float * x, float * dst, const int ncols, const fl s_sum[warp_id] = tmp; } /* - DPCT1118:7: SYCL group functions and algorithms must be encountered in + DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1127,8 +1104,8 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, const int iqs, dfloat2 &v) { const block_q4_1 * x = (const block_q4_1 *) vx; - const dfloat d = x[ib].dm[1]; - const dfloat m = x[ib].dm[0]; + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; const int vui = x[ib].qs[iqs]; @@ -1172,8 +1149,8 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, const int iqs, dfloat2 &v) { const block_q5_1 * x = (const block_q5_1 *) vx; - const dfloat d = x[ib].dm[1]; - const dfloat m = x[ib].dm[0]; + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); @@ -1228,8 +1205,8 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri const uint8_t q = x[i].qs[32*n + l]; dst_t * y = yy + i*QK_K + 128*n; - float dall = x[i].dm[1]; - float dmin = x[i].dm[0]; + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); @@ -1330,8 +1307,8 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri dst_t * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm[1]; - const float dmin = x[i].dm[0]; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; const uint8_t * q = x[i].qs + 32*il + n*ir; @@ -1371,8 +1348,8 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri dst_t * y = yy + i*QK_K + 64*il + 2*ir; - const float dall = x[i].dm[1]; - const float dmin = x[i].dm[0]; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * qh = x[i].qh + 2*ir; @@ -1450,7 +1427,7 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri } /* -DPCT1110:8: The total declared local variable size in device function +DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high @@ -1500,8 +1477,8 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm[1]; - const float dmin = x[i].dm[0]; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); aux[0] = a[0] & 0x0f0f0f0f; @@ -1561,12 +1538,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:9: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1577,7 +1548,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, } /* -DPCT1110:10: The total declared local variable size in device function +DPCT1110:5: The total declared local variable size in device function dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high @@ -1686,12 +1657,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:11: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1702,7 +1667,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, } /* -DPCT1110:12: The total declared local variable size in device function +DPCT1110:6: The total declared local variable size in device function dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high @@ -1763,8 +1728,8 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm[1]; - const float dmin = x[i].dm[0]; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1845,12 +1810,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:13: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1861,7 +1820,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, } /* -DPCT1110:14: The total declared local variable size in device function +DPCT1110:7: The total declared local variable size in device function dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high @@ -1916,8 +1875,8 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm[1]; - const float dmin = x[i].dm[0]; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; const uint16_t * a = (const uint16_t *)x[i].scales; aux[0] = a[im+0] & kmask1; @@ -1985,12 +1944,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:15: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -2106,12 +2059,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:16: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -2162,20 +2109,8 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:17: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor( item_ct1.get_sub_group(), amax, mask)); - /* - DPCT1023:18: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ sum += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask); } @@ -2300,9 +2235,9 @@ static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ #define VDR_Q4_0_Q8_1_MMQ 4 template -static __dpct_inline__ float -vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4, - const sycl::half2 &ds8, const sycl::stream &stream_ct1) { +static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, + const float &d4, + const sycl::half2 &ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2314,16 +2249,17 @@ vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4, const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); } - const float2 ds8f = __half22float2(ds8); + const sycl::float2 ds8f = + ds8.convert(); // second part effectively subtracts 8 from each quant value - return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); + return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2331,9 +2267,9 @@ vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4, #define VDR_Q4_1_Q8_1_MMQ 4 template -static __dpct_inline__ float -vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, - const sycl::half2 &ds8, const sycl::stream &stream_ct1) { +static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm4, + const sycl::half2 &ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2345,8 +2281,8 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); } #ifdef GGML_CUDA_F16 @@ -2354,16 +2290,18 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, const float d4d8 = tmp.x; const float m4s8 = tmp.y; #else - const float2 dm4f = __half22float2(dm4); - const float2 ds8f = __half22float2(ds8); - const float d4d8 = dm4f.x * ds8f.x; - const float m4s8 = dm4f.y * ds8f.y; + const sycl::float2 dm4f = + dm4.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d4d8 = dm4f.x() * ds8f.x(); + const float m4s8 = dm4f.y() * ds8f.y(); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2373,8 +2311,7 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4, template static __dpct_inline__ float vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, - const float &d5, const sycl::half2 &ds8, - const sycl::stream &stream_ct1) { + const float &d5, const sycl::half2 &ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2387,22 +2324,25 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values } - const float2 ds8f = __half22float2(ds8); + const sycl::float2 ds8f = + ds8.convert(); // second part effectively subtracts 16 from each quant value - return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); + return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2412,8 +2352,7 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, template static __dpct_inline__ float vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, - const sycl::half2 &dm5, const sycl::half2 &ds8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm5, const sycl::half2 &ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2426,14 +2365,16 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values } #ifdef GGML_CUDA_F16 @@ -2441,17 +2382,19 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, const float d5d8 = tmp.x; const float m5s8 = tmp.y; #else - const float2 dm5f = __half22float2(dm5); - const float2 ds8f = __half22float2(ds8); - const float d5d8 = dm5f.x * ds8f.x; - const float m5s8 = dm5f.y * ds8f.y; + const sycl::float2 dm5f = + dm5.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d5d8 = dm5f.x() * ds8f.x(); + const float m5s8 = dm5f.y() * ds8f.y(); #endif // GGML_CUDA_F16 // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2459,9 +2402,9 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, #define VDR_Q8_0_Q8_1_MMQ 8 template -static __dpct_inline__ float -vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0, - const float &d8_1, const sycl::stream &stream_ct1) { +static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, + const float &d8_0, + const float &d8_1) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2470,19 +2413,19 @@ vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0, #pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); + sumi = dpct::dp4a(v[i], u[i], sumi); } return d8_0*d8_1 * sumi; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template -static __dpct_inline__ float -vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, - const sycl::half2 &ds8, const sycl::stream &stream_ct1) { +static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm8, + const sycl::half2 &ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2491,7 +2434,7 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, #pragma unroll for (int i = 0; i < vdr; ++i) { // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); + sumi = dpct::dp4a(v[i], u[i], sumi); } #ifdef GGML_CUDA_F16 @@ -2499,16 +2442,18 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, const float d8d8 = tmp.x; const float m8s8 = tmp.y; #else - const float2 dm8f = __half22float2(dm8); - const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8f.x * ds8f.x; - const float m8s8 = dm8f.y * ds8f.y; + const sycl::float2 dm8f = + dm8.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d8d8 = dm8f.x() * ds8f.x(); + const float m8s8 = dm8f.y() * ds8f.y(); #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2518,8 +2463,7 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8, // contiguous v/x values static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, - const sycl::half2 &dm2, const float *__restrict__ d8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm2, const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2532,20 +2476,25 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int vi = (v >> (2*i)) & 0x03030303; - sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + sumf_d += + d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product // fill int with 4x m int m = sc >> 4; m |= m << 8; m |= m << 16; - sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + sumf_m += d8[i] * + dpct::dp4a( + m, u[i], + 0); // multiply constant q2_K part with sum of q8_1 values } - const float2 dm2f = __half22float2(dm2); + const sycl::float2 dm2f = + dm2.convert(); - return dm2f.x*sumf_d - dm2f.y*sumf_m; + return dm2f.x() * sumf_d - dm2f.y() * sumf_m; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2553,8 +2502,7 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const uint8_t *__restrict__ scales, - const sycl::half2 &dm2, const float &d8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm2, const float &d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2574,18 +2522,20 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, #pragma unroll for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product - sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = dpct::dp4a(m, u[i], + sumi_m); // multiply sum of q8_1 values with m } sumi_d += sumi_d_sc * (sc & 0xF); } - const float2 dm2f = __half22float2(dm2); + const sycl::float2 dm2f = + dm2.convert(); - return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); + return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2596,8 +2546,7 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int &vl, const int &vh, const int *__restrict__ u, const uint8_t *__restrict__ scales, const int &scale_offset, - const float &d3, const float *__restrict__ d8, - const sycl::stream &stream_ct1) { + const float &d3, const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2621,14 +2570,15 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int vih = ((vh >> i) << 2) & 0x04040404; - const int vi = __vsubss4(vil, vih); + const int vi = + dpct::vectorized_binary(vil, vih, dpct::sub_sat()); - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product } return d3 * sumf; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2636,7 +2586,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const int8_t *__restrict__ scales, const float &d3, - const float &d8, const sycl::stream &stream_ct1) { + const float &d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2647,7 +2597,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, int sumi_sc = 0; for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product } sumi += sumi_sc * scales[i0 / (QI8_1/2)]; @@ -2655,7 +2605,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, return d3*d8 * sumi; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2666,8 +2616,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int *__restrict__ v, const int *__restrict__ u, const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const float *__restrict__ d8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm4, const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2679,19 +2628,24 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; - const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + const int dot1 = + dpct::dp4a(v1i, u[2 * i + 1], + dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 1], + dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u sumf_d += d8[i] * (dot1 * sc[i]); sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values } - const float2 dm4f = __half22float2(dm4); + const sycl::float2 dm4f = + dm4.convert(); - return dm4f.x*sumf_d - dm4f.y*sumf_m; + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2699,8 +2653,7 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( const int *__restrict__ v, const int *__restrict__ u, const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2713,21 +2666,24 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( #pragma unroll for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F, + u[i * QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i]); + const sycl::float2 ds8f = + ds8[i].convert(); - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val } - const float2 dm4f = __half22float2(dm4); + const sycl::float2 dm4f = + dm4.convert(); - return dm4f.x*sumf_d - dm4f.y*sumf_m; + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2739,7 +2695,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int *__restrict__ vl, const int *__restrict__ vh, const int *__restrict__ u, const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, const sycl::half2 &dm5, - const float *__restrict__ d8, const sycl::stream &stream_ct1) { + const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2757,20 +2713,25 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int v0i = vl0i | vh0i; const int v1i = vl1i | vh1i; - const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + const int dot1 = + dpct::dp4a(v0i, u[2 * i + 0], + dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 0], + dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u sumf_d += d8[i] * (dot1 * sc[i]); sumf_m += d8[i] * (dot2 * m[i]); } - const float2 dm5f = __half22float2(dm5); + const sycl::float2 dm5f = + dm5.convert(); - return dm5f.x*sumf_d - dm5f.y*sumf_m; + return dm5f.x() * sumf_d - dm5f.y() * sumf_m; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2778,8 +2739,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( const int *__restrict__ v, const int *__restrict__ u, const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, - const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8, - const sycl::stream &stream_ct1) { + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2792,21 +2752,24 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( #pragma unroll for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j], + sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i]); + const sycl::float2 ds8f = + ds8[i].convert(); - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val } - const float2 dm4f = __half22float2(dm4); + const sycl::float2 dm4f = + dm4.convert(); - return dm4f.x*sumf_d - dm4f.y*sumf_m; + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2814,10 +2777,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( #define VDR_Q6_K_Q8_1_MMQ 8 // contiguous v/x values -static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq( - const int &vl, const int &vh, const int *__restrict__ u, - const int8_t *__restrict__ scales, const float &d, - const float *__restrict__ d8, const sycl::stream &stream_ct1) { +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, + const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d, + const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2831,14 +2795,15 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq( const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + const int vi = dpct::vectorized_binary( + (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product } return d*sumf; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -2846,8 +2811,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq( static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, const int8_t *__restrict__ sc, const float &d6, - const float *__restrict__ d8, - const sycl::stream &stream_ct1) { + const float *__restrict__ d8) { #if DPCT_COMPATIBILITY_TEMP >= \ MIN_CC_DP4A // lowest compute capability for integer intrinsics @@ -2855,31 +2819,35 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, #pragma unroll for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { - int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale #pragma unroll for (int i = i0; i < i0 + 2; ++i) { - sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product - sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product - - sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product - sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0], + sumi_d.x()); // SIMD dot product + sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1], + sumi_d.x()); // SIMD dot product + + sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4], + sumi_d.y()); // SIMD dot product + sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5], + sumi_d.y()); // SIMD dot product } - sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + sumf_d += d8[i0 / 4] * + (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y()); } return d6 * sumf_d; #else - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } static __dpct_inline__ float vec_dot_q4_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; @@ -2893,8 +2861,7 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq, u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); } - return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds, - stream_ct1); + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } template @@ -2961,7 +2928,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -2975,17 +2942,14 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } - return vec_dot_q4_0_q8_1_impl( - &x_ql[i * (WARP_SIZE + 1) + k], u, - x_dmf[i * (WARP_SIZE / QI4_0) + i / QI4_0 + k / QI4_0], - y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], - stream_ct1); + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __dpct_inline__ float vec_dot_q4_1_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; @@ -2999,8 +2963,7 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq, u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); } - return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, - bq8_1->ds, stream_ct1); + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } template @@ -3065,7 +3028,7 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -3078,17 +3041,14 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } - return vec_dot_q4_1_q8_1_impl( - &x_ql[i * (WARP_SIZE + 1) + k], u, - x_dm[i * (WARP_SIZE / QI4_1) + i / QI4_1 + k / QI4_1], - y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], - stream_ct1); + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __dpct_inline__ float vec_dot_q5_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; @@ -3104,8 +3064,7 @@ vec_dot_q5_0_q8_1(const void *__restrict__ vbq, u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); } - return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, - bq8_1->ds, stream_ct1); + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } template @@ -3192,7 +3151,7 @@ static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -3208,16 +3167,13 @@ static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } - return vec_dot_q8_0_q8_1_impl( - &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], - y_df[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], - stream_ct1); + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __dpct_inline__ float vec_dot_q5_1_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; @@ -3233,8 +3189,7 @@ vec_dot_q5_1_q8_1(const void *__restrict__ vbq, u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); } - return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, - bq8_1->ds, stream_ct1); + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } template @@ -3316,7 +3271,7 @@ static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -3330,16 +3285,13 @@ static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; } - return vec_dot_q8_1_q8_1_impl( - &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], - y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)], - stream_ct1); + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __dpct_inline__ float vec_dot_q8_0_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; @@ -3353,7 +3305,7 @@ vec_dot_q8_0_q8_1(const void *__restrict__ vbq, } return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, - bq8_1->ds[1], stream_ct1); + bq8_1->ds[0]); } template @@ -3419,22 +3371,20 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; (void)x_sc; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - return vec_dot_q8_0_q8_1_impl( - &x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], - x_dmf[i * (WARP_SIZE / QI8_0) + i / QI8_0 + k / QI8_0], - y_df[j * (WARP_SIZE / QI8_1) + k / QI8_1], stream_ct1); + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); } static __dpct_inline__ float vec_dot_q2_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q2_K * bq2_K = (const block_q2_K *) vbq; @@ -3450,10 +3400,10 @@ vec_dot_q2_K_q8_1(const void *__restrict__ vbq, #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds[1]; + d8[i] = bq8_1[bq8_offset + i].ds[0]; } - return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8, stream_ct1); + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } template @@ -3532,7 +3482,7 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; const int kbx = k / QI2_K; @@ -3552,16 +3502,12 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, - x_dm[i * (WARP_SIZE / QI2_K) + i / QI2_K + kbx], y_df[index_y / QI8_1], - stream_ct1); + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } static __dpct_inline__ float vec_dot_q3_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q3_K * bq3_K = (const block_q3_K *) vbq; @@ -3581,11 +3527,10 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq, #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + i].ds[1]; + d8[i] = bq8_1[bq8_offset + i].ds[0]; } - return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, - d, d8, stream_ct1); + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } template @@ -3692,7 +3637,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { const int kbx = k / QI3_K; const int ky = (k % QI3_K) * QR3_K; @@ -3716,16 +3661,12 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( } const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, - x_dmf[i * (WARP_SIZE / QI3_K) + i / QI3_K + kbx], y_df[index_y / QI8_1], - stream_ct1); + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } static __dpct_inline__ float vec_dot_q4_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { #ifndef GGML_QKK_64 const block_q4_K * bq4_K = (const block_q4_K *) vbq; @@ -3761,14 +3702,14 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq, for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = bq8i->ds[1]; + d8[i] = bq8i->ds[0]; const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; u[2*i+1] = q8[4]; } - return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8, stream_ct1); + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); #else @@ -3905,22 +3846,19 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], - &y_qs[index_y], sc, sc + 8, - x_dm[i * (WARP_SIZE / QI4_K) + i / QI4_K], - &y_ds[index_y / QI8_1], stream_ct1); + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __dpct_inline__ float vec_dot_q5_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { #ifndef GGML_QKK_64 const block_q5_K * bq5_K = (const block_q5_K *) vbq; @@ -3963,8 +3901,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq, u[2*i+1] = q8[4]; } - return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8, - stream_ct1); + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); #else @@ -4108,23 +4045,20 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; - return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, - sc + 8, - x_dm[i * (WARP_SIZE / QI5_K) + i / QI5_K], - &y_ds[index_y / QI8_1], stream_ct1); + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __dpct_inline__ float vec_dot_q6_K_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const sycl::stream &stream_ct1) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { const block_q6_K * bq6_K = (const block_q6_K *) vbq; @@ -4143,11 +4077,10 @@ vec_dot_q6_K_q8_1(const void *__restrict__ vbq, #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = bq8_1[bq8_offset + 2 * i].ds[1]; + d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; } - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8, - stream_ct1); + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } template @@ -4244,7 +4177,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, const int *__restrict__ x_qh, const int *__restrict__ x_sc, const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, - const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) { + const int &i, const int &j, const int &k) { (void)x_qh; const float * x_dmf = (const float *) x_dm; @@ -4254,10 +4187,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq( - &x_ql[index_x], &y_qs[index_y], sc, - x_dmf[i * (WARP_SIZE / QI6_K) + i / QI6_K], &y_df[index_y / QI8_1], - stream_ct1); + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } template /* -DPCT1110:19: The total declared local variable size in device function mul_mat_q +DPCT1110:8: The total declared local variable size in device function mul_mat_q exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure. @@ -4274,7 +4204,8 @@ static __dpct_inline__ void mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, float *__restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::nd_item<3> &item_ct1, int *tile_y_qs, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, + sycl::half2 *tile_x_dm, int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { const block_q_t * x = (const block_q_t *) vx; @@ -4297,7 +4228,53 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, int * tile_x_qh = nullptr; int * tile_x_sc = nullptr; - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + /* + DPCT1084:11: The function call "allocate_tiles_q4_0" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:12: The function call "allocate_tiles_q4_1" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:13: The function call "allocate_tiles_q5_0" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:14: The function call "allocate_tiles_q5_1" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:15: The function call "allocate_tiles_q8_0" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:16: The function call "allocate_tiles_q2_K" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:17: The function call "allocate_tiles_q3_K" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:18: The function call "allocate_tiles_q4_K" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + /* + DPCT1084:19: The function call "allocate_tiles_q5_K" has multiple migration + results in different template instantiations that could not be unified. You + may need to adjust the code. + */ + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, tile_x_ql, + tile_x_dm, tile_x_sc); float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; @@ -4347,16 +4324,16 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src)[1]; + *dfi_dst = (*dsi_src)[0]; } } /* - DPCT1118:20: SYCL group functions and algorithms must be encountered + DPCT1118:9: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:71: Consider replacing sycl::nd_item::barrier() with + DPCT1065:65: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -4377,11 +4354,11 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, } /* - DPCT1118:21: SYCL group functions and algorithms must be encountered + DPCT1118:10: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:72: Consider replacing sycl::nd_item::barrier() with + DPCT1065:66: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -4438,7 +4415,8 @@ template static void mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4460,9 +4438,12 @@ template static void const int mmq_y = MMQ_Y_Q4_0_AMPERE; const int nwarps = NWARPS_Q4_0_AMPERE; - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, + vec_dot_q4_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_0_PASCAL; @@ -4474,7 +4455,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_0_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4503,12 +4484,13 @@ template static void __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4530,9 +4512,12 @@ template static void const int mmq_y = MMQ_Y_Q4_1_AMPERE; const int nwarps = NWARPS_Q4_1_AMPERE; - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, + vec_dot_q4_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_1_PASCAL; @@ -4544,7 +4529,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_1_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4576,7 +4561,8 @@ template static void mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4598,9 +4584,12 @@ template static void const int mmq_y = MMQ_Y_Q5_0_AMPERE; const int nwarps = NWARPS_Q5_0_AMPERE; - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, + vec_dot_q5_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_0_PASCAL; @@ -4612,7 +4601,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_0_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4644,7 +4633,8 @@ template static void mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4666,9 +4656,12 @@ mul_mat_q5_1( const int mmq_y = MMQ_Y_Q5_1_AMPERE; const int nwarps = NWARPS_Q5_1_AMPERE; - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, + vec_dot_q5_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_1_PASCAL; @@ -4680,7 +4673,7 @@ mul_mat_q5_1( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_1_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4712,7 +4705,8 @@ template static void mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4734,9 +4728,12 @@ template static void const int mmq_y = MMQ_Y_Q8_0_AMPERE; const int nwarps = NWARPS_Q8_0_AMPERE; - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, + vec_dot_q8_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q8_0_PASCAL; @@ -4748,7 +4745,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q8_0_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4780,7 +4777,8 @@ template static void mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4802,9 +4800,12 @@ mul_mat_q2_K( const int mmq_y = MMQ_Y_Q2_K_AMPERE; const int nwarps = NWARPS_Q2_K_AMPERE; - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, + vec_dot_q2_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q2_K_PASCAL; @@ -4816,7 +4817,7 @@ mul_mat_q2_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q2_K_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4845,12 +4846,13 @@ template static void __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4872,9 +4874,12 @@ template static void const int mmq_y = MMQ_Y_Q3_K_AMPERE; const int nwarps = NWARPS_Q3_K_AMPERE; - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, + vec_dot_q3_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q3_K_PASCAL; @@ -4886,7 +4891,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q3_K_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4915,12 +4920,13 @@ template static void __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4942,9 +4948,12 @@ template static void const int mmq_y = MMQ_Y_Q4_K_AMPERE; const int nwarps = NWARPS_Q4_K_AMPERE; - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, + vec_dot_q4_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_K_PASCAL; @@ -4956,7 +4965,7 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q4_K_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4988,7 +4997,8 @@ template static void mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -5010,9 +5020,12 @@ mul_mat_q5_K( const int mmq_y = MMQ_Y_Q5_K_AMPERE; const int nwarps = NWARPS_Q5_K_AMPERE; - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, + vec_dot_q5_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_K_PASCAL; @@ -5024,7 +5037,7 @@ mul_mat_q5_K( (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q5_K_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5053,12 +5066,13 @@ template static void __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) #endif // defined(RDNA3) || defined(RDNA2) #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA - + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) #endif // __CUDA_ARCH__ < CC_VOLTA mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -5080,9 +5094,12 @@ template static void const int mmq_y = MMQ_Y_Q6_K_AMPERE; const int nwarps = NWARPS_Q6_K_AMPERE; - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, + vec_dot_q6_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, + tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds); #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q6_K_PASCAL; @@ -5094,14 +5111,13 @@ template static void (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else (void) vec_dot_q6_K_q8_1_mul_mat; - bad_arch(stream_ct1); + bad_arch(); #endif // __CUDA_ARCH__ >= CC_VOLTA } template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const sycl::stream &stream_ct1) { + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -5130,18 +5146,12 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1); + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); } // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:22: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -5209,12 +5219,6 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:23: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -5258,7 +5262,7 @@ static void mul_mat_p021_f16_f32( // x is transposed and permuted const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; const float xi = - sycl::vec{x[ix]} + sycl::vec(x[ix]) .convert()[0]; const int row_y = col_x; @@ -5276,12 +5280,6 @@ static void mul_mat_p021_f16_f32( // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:24: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -5326,7 +5324,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const int iy = channel*nrows_y + row_y; const float xi = - sycl::vec{x[ix]} + sycl::vec(x[ix]) .convert()[0]; tmp += xi * y[iy]; @@ -5335,12 +5333,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous // sum up partial sums and write back result #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - /* - DPCT1023:25: The SYCL sub-group does not support mask options for - dpct::permute_sub_group_by_xor. You can specify - "--use-experimental-features=masked-sub-group-operation" to use the - experimental helper function to migrate __shfl_xor_sync. - */ tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -5361,7 +5353,7 @@ static void cpy_1_f32_f16(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; sycl::half *dsti = (sycl::half *)cdsti; - *dsti = sycl::vec{(*xi)} + *dsti = sycl::vec(*xi) .convert()[0]; } @@ -5729,7 +5721,7 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, dst_row[col] = col; } /* - DPCT1065:73: Consider replacing sycl::nd_item::barrier() with + DPCT1065:67: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5750,11 +5742,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, } } /* - DPCT1118:26: SYCL group functions and algorithms must be encountered + DPCT1118:20: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:74: Consider replacing sycl::nd_item::barrier() with + DPCT1065:68: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5806,11 +5798,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in buf[lane_id] = -INFINITY; } /* - DPCT1118:27: SYCL group functions and algorithms must be encountered in + DPCT1118:21: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:75: Consider replacing sycl::nd_item::barrier() with + DPCT1065:69: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5820,11 +5812,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in buf[warp_id] = max_val; } /* - DPCT1118:28: SYCL group functions and algorithms must be encountered in + DPCT1118:22: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:76: Consider replacing sycl::nd_item::barrier() with + DPCT1065:70: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5852,11 +5844,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in buf[lane_id] = 0.f; } /* - DPCT1118:29: SYCL group functions and algorithms must be encountered in + DPCT1118:23: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:77: Consider replacing sycl::nd_item::barrier() with + DPCT1065:71: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5866,11 +5858,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in buf[warp_id] = tmp; } /* - DPCT1118:30: SYCL group functions and algorithms must be encountered in + DPCT1118:24: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ /* - DPCT1065:78: Consider replacing sycl::nd_item::barrier() with + DPCT1065:72: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory. */ @@ -5938,12 +5930,12 @@ static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta, if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst[offset_dst] = - sycl::vec{0.0f} + sycl::vec(0.0f) .convert()[0]; } else { const int64_t offset_src = item_ct1.get_group(0) * offset_delta; dst[offset_dst] = - sycl::vec{x[offset_src + iih * IW + iiw]} + sycl::vec(x[offset_src + iih * IW + iiw]) .convert()[0]; } } @@ -6009,6 +6001,7 @@ static void get_rows_cuda_float(const ggml_tensor *src0, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -6106,11 +6099,12 @@ struct bin_bcast_cuda { sycl::range<3> block_dims(1, 1, 1); block_dims[2] = std::min(hne0, block_size); - block_dims[1] = - std::min(ne1, block_size / block_dims[2]); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); block_dims[0] = std::min( - std::min(ne2 * ne3, block_size / block_dims[2] / - block_dims[1]), + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), 64U); sycl::range<3> block_nums( @@ -6124,6 +6118,7 @@ struct bin_bcast_cuda { { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), @@ -6137,13 +6132,14 @@ struct bin_bcast_cuda { } } else { /* - DPCT1049:31: The work-group size passed to the SYCL kernel may + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -6279,7 +6275,7 @@ static void norm_f32_cuda(const float *x, float *dst, const int ncols, } else { const sycl::range<3> block_dims(1, 1, 1024); /* - DPCT1049:32: The work-group size passed to the SYCL kernel may exceed + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ @@ -6324,7 +6320,7 @@ static void group_norm_f32_cuda(const float *x, float *dst, } else { const sycl::range<3> block_dims(1, 1, 1024); /* - DPCT1049:33: The work-group size passed to the SYCL kernel may exceed + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ @@ -6411,7 +6407,7 @@ static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols, } else { const sycl::range<3> block_dims(1, 1, 1024); /* - DPCT1049:34: The work-group size passed to the SYCL kernel may exceed + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ @@ -6440,6 +6436,7 @@ static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(num_blocks * block_size, block_size), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6456,6 +6453,7 @@ static void dequantize_block_cuda(const void *__restrict__ vx, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>( sycl::range<3>(1, 1, num_blocks) * @@ -6475,6 +6473,7 @@ static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), @@ -6495,6 +6494,7 @@ static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), @@ -6514,6 +6514,7 @@ static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), @@ -6531,6 +6532,7 @@ static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), @@ -6551,6 +6553,7 @@ static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), @@ -6633,6 +6636,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6653,6 +6657,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6673,6 +6678,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6693,6 +6699,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6713,6 +6720,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6810,6 +6818,7 @@ static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -6827,17 +6836,13 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy, @@ -6848,17 +6853,13 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy, @@ -6869,17 +6870,13 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy, @@ -6890,17 +6887,13 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy, @@ -6911,17 +6904,13 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy, @@ -6932,17 +6921,13 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy, @@ -6953,17 +6938,13 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy, @@ -6974,17 +6955,13 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy, @@ -6995,17 +6972,13 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy, @@ -7016,17 +6989,13 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { - mul_mat_vec_q(vx, vy, dst, ncols, nrows, - item_ct1, stream_ct1); - }); - }); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); } static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy, @@ -7036,7 +7005,8 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7068,39 +7038,99 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:35: The work-group size passed to the SYCL kernel may exceed + DPCT1049:29: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:36: The work-group size passed to the SYCL kernel may exceed + DPCT1049:30: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7116,7 +7146,8 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7148,39 +7179,101 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:37: The work-group size passed to the SYCL kernel may exceed + DPCT1049:31: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + //zjy const int mmq_y = MMQ_Y_Q4_1_PASCAL; - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + mmq_y /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:38: The work-group size passed to the SYCL kernel may exceed + DPCT1049:32: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7196,7 +7289,8 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7228,39 +7322,99 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:39: The work-group size passed to the SYCL kernel may exceed + DPCT1049:33: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + DPCT1049:34: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7276,7 +7430,8 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7308,39 +7463,99 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + DPCT1049:35: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:42: The work-group size passed to the SYCL kernel may exceed + DPCT1049:36: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7356,7 +7571,8 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7388,39 +7604,99 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:43: The work-group size passed to the SYCL kernel may exceed + DPCT1049:37: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:44: The work-group size passed to the SYCL kernel may exceed + DPCT1049:38: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7436,7 +7712,8 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7468,39 +7745,99 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:45: The work-group size passed to the SYCL kernel may exceed + DPCT1049:39: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:46: The work-group size passed to the SYCL kernel may exceed + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7518,7 +7855,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy, #if QK_K == 256 int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7550,39 +7888,99 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:47: The work-group size passed to the SYCL kernel may exceed + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:48: The work-group size passed to the SYCL kernel may exceed + DPCT1049:42: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } #endif } @@ -7599,7 +7997,8 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7631,39 +8030,99 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:49: The work-group size passed to the SYCL kernel may exceed + DPCT1049:43: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:50: The work-group size passed to the SYCL kernel may exceed + DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7679,7 +8138,8 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7711,39 +8171,99 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:51: The work-group size passed to the SYCL kernel may exceed + DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:52: The work-group size passed to the SYCL kernel may exceed + DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7759,7 +8279,8 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy, dpct::queue_ptr stream) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -7791,39 +8312,99 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy, if (nrows_x % mmq_y == 0) { const bool need_check = false; /* - DPCT1049:53: The work-group size passed to the SYCL kernel may exceed + DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } else { const bool need_check = true; /* - DPCT1049:54: The work-group size passed to the SYCL kernel may exceed + DPCT1049:48: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->submit([&](sycl::handler &cgh) { - sycl::stream stream_ct1(64 * 1024, 80, cgh); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, - ncols_y, nrows_y, nrows_dst, - stream_ct1); - }); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (2 * WARP_SIZE) + + dpct_placeholder /*Fix the type mannually*/), + cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / QI6_K) + + dpct_placeholder /*Fix the type mannually*/ / + QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ * + (WARP_SIZE / 8) + + dpct_placeholder /*Fix the type mannually*/ / + 8), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } } } catch (sycl::exception const &exc) { @@ -7844,6 +8425,7 @@ static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -7863,6 +8445,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda( { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { @@ -7885,6 +8468,7 @@ static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), @@ -7909,6 +8493,7 @@ static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), @@ -7990,6 +8575,7 @@ static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), @@ -8038,12 +8624,13 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows, const sycl::range<3> block_nums(1, num_blocks_x, nrows); if (pos == nullptr) { /* - DPCT1049:55: The work-group size passed to the SYCL kernel may exceed + DPCT1049:49: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -8053,12 +8640,13 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows, }); } else { /* - DPCT1049:56: The work-group size passed to the SYCL kernel may exceed + DPCT1049:50: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -8085,12 +8673,13 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows, if (pos == nullptr) { /* - DPCT1049:57: The work-group size passed to the SYCL kernel may exceed + DPCT1049:51: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -8101,12 +8690,13 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows, }); } else { /* - DPCT1049:58: The work-group size passed to the SYCL kernel may exceed + DPCT1049:52: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -8168,7 +8758,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols, const sycl::range<3> block_nums(1, nrows, 1); if (order == GGML_SORT_ASC) { /* - DPCT1049:59: The work-group size passed to the SYCL kernel may exceed + DPCT1049:53: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ @@ -8179,7 +8769,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols, }); } else if (order == GGML_SORT_DESC) { /* - DPCT1049:60: The work-group size passed to the SYCL kernel may exceed + DPCT1049:54: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ @@ -8217,13 +8807,13 @@ static void soft_max_f32_cuda(const float *x, const float *y, float *dst, const sycl::range<3> block_dims(1, 1, nth); const sycl::range<3> block_nums(1, 1, nrows_x); /* - DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the + DPCT1049:55: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ stream->submit([&](sycl::handler &cgh) { /* - DPCT1101:111: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was + DPCT1101:105: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was replaced with a value. Modify the code to use the original expression, provided in comments, if it is correct. */ @@ -8250,6 +8840,7 @@ static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH, { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), @@ -8293,7 +8884,8 @@ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0}; static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try { scoped_spin_lock lock(g_cuda_pool_lock); int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); #ifdef DEBUG_CUDA_MALLOC int nnz = 0; size_t max_size = 0; @@ -8354,7 +8946,8 @@ catch (sycl::exception const &exc) { static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try { scoped_spin_lock lock(g_cuda_pool_lock); int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { cuda_buffer& b = g_cuda_buffer_pool[id][i]; @@ -8377,10 +8970,10 @@ catch (sycl::exception const &exc) { #if !defined(GGML_USE_HIPBLAS) // pool with virtual memory /* -DPCT1082:79: Migration of CUmemGenericAllocationHandle type is not supported. +DPCT1082:73: Migration of CUmemGenericAllocationHandle type is not supported. */ -// static std::vector -// g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES]; +static std::vector + g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES]; static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB @@ -8388,7 +8981,8 @@ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try { scoped_spin_lock lock(g_cuda_pool_lock); int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types const size_t alignment = 128; @@ -8406,54 +9000,54 @@ static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try { // allocate more physical memory /* - DPCT1082:80: Migration of CUmemAllocationProp type is not supported. + DPCT1082:74: Migration of CUmemAllocationProp type is not supported. */ CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = id; /* - DPCT1082:81: Migration of CUmemGenericAllocationHandle type is not + DPCT1082:75: Migration of CUmemGenericAllocationHandle type is not supported. */ - // CUmemGenericAllocationHandle handle; + CUmemGenericAllocationHandle handle; /* - DPCT1007:84: Migration of cuMemCreate is not supported. + DPCT1007:78: Migration of cuMemCreate is not supported. */ - // CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); + CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); // reserve virtual address space (if not already reserved) if (g_cuda_pool_addr[id] == 0) { /* - DPCT1007:85: Migration of cuMemAddressReserve is not supported. + DPCT1007:79: Migration of cuMemAddressReserve is not supported. */ - // CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], - // CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], + CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); } // map at the end of the pool /* - DPCT1007:86: Migration of cuMemMap is not supported. + DPCT1007:80: Migration of cuMemMap is not supported. */ - // CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], - // reserve_size, 0, handle, 0)); + CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], + reserve_size, 0, handle, 0)); // set access /* - DPCT1082:87: Migration of CUmemAccessDesc type is not supported. + DPCT1082:81: Migration of CUmemAccessDesc type is not supported. */ CUmemAccessDesc access = {}; access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; access.location.id = id; access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; /* - DPCT1007:88: Migration of cuMemSetAccess is not supported. + DPCT1007:82: Migration of cuMemSetAccess is not supported. */ CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], reserve_size, &access, 1)); // add to the pool - // g_cuda_pool_handles[id].push_back(handle); + g_cuda_pool_handles[id].push_back(handle); g_cuda_pool_size[id] += reserve_size; //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", @@ -8482,7 +9076,8 @@ catch (sycl::exception const &exc) { static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try { scoped_spin_lock lock(g_cuda_pool_lock); int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); #ifdef DEBUG_CUDA_MALLOC printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr); @@ -8501,8 +9096,8 @@ catch (sycl::exception const &exc) { static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try { int id; - - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); if (g_device_caps[id].vmm) { return ggml_cuda_pool_malloc_vmm(size, actual_size); } else { @@ -8517,7 +9112,8 @@ catch (sycl::exception const &exc) { static void ggml_cuda_pool_free(void *ptr, size_t size) try { int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); if (g_device_caps[id].vmm) { ggml_cuda_pool_free_vmm(ptr, size); } else { @@ -8573,43 +9169,37 @@ bool ggml_cublas_loaded(void) { return g_cublas_loaded; } -void print_devices(int device_count){ +void print_devices(){ + int device_count = dpct::dev_mgr::instance().device_count() for (int id = 0; id < device_count; ++id) { dpct::device_info prop; CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( prop, dpct::dev_mgr::instance().get_device(id)))); - fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.get_name(), prop.get_major_version(), prop.get_minor_version()); } } -int get_env_value(const char *env_name, int default_val){ - char * user_device_string = getenv(env_name); - int user_device_number = -1; - - unsigned n; - if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) { - user_device_number = (int)n; - } else { - user_device_number=default_val; - } -} void ggml_init_cublas() try { static bool initialized = false; if (!initialized) { + print_devices(); -#ifdef __HIP_PLATFORM_AMD__ - // Workaround for a rocBLAS bug when using multiple graphics cards: - // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 - rocblas_initialize(); - CUDA_CHECK(cudaDeviceSynchronize()); -#endif + char * user_device_string = getenv("GGML_SYCL_DEVICE"); + int user_device_number = -1; - g_device_count = dpct::dev_mgr::instance().device_count(); - if (DPCT_CHECK_ERROR(g_device_count != 0)) { + unsigned n; + if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) { + user_device_number = (int)n; + } else { + user_device_number=0; + } + + if (DPCT_CHECK_ERROR(g_device_count = + dpct::dev_mgr::instance().device_count()) != + 0) { initialized = true; g_cublas_loaded = false; return; @@ -8628,55 +9218,19 @@ void ggml_init_cublas() try { fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); #endif fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - print_devices(g_device_count); - //zjy hardcode, force set to 1 device + //zjy hardcode, force set to 1 device g_device_count = 1; - for (int id = 0; id < g_device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) - //int device; - //CU_CHECK(DPCT_CHECK_ERROR(device = id)); - /* - DPCT1028:89: The cuDeviceGetAttribute was not migrated because - parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is - unsupported. - */ - /*CU_CHECK(cuDeviceGetAttribute( - &device_vmm, - CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, - device)); - */ - //if (device_vmm) { - /* - DPCT1082:90: Migration of CUmemAllocationProp type is not - supported. - */ - //CUmemAllocationProp alloc_prop = {}; - //alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - //alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - //alloc_prop.location.id = id; - /* - DPCT1007:91: Migration of cuMemGetAllocationGranularity is not - supported. - */ - //CU_CHECK(cuMemGetAllocationGranularity( - // &g_device_caps[id].vmm_granularity, &alloc_prop, - // CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - //} -#endif // !defined(GGML_USE_HIPBLAS) g_device_caps[id].vmm = !!device_vmm; dpct::device_info prop; - dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(id))ï¼› - - // CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( - // prop, dpct::dev_mgr::instance().get_device(id)))); + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(id)))); /* - DPCT1005:92: The SYCL device version is different from CUDA Compute + DPCT1005:86: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code. */ fprintf(stderr, @@ -8690,55 +9244,45 @@ void ggml_init_cublas() try { g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else /* - DPCT1005:93: The SYCL device version is different from CUDA Compute + DPCT1005:87: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code. */ g_device_caps[id].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - - int user_device_number = get_env_value("GGML_SYCL_DEVICE", 0); - for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id)ï¼› - // CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(ggml_cuda_set_device(user_device_number)); // create cuda streams for (int is = 0; is < MAX_STREAMS; ++is) { /* - DPCT1025:105: The SYCL queue is created ignoring the flag and + DPCT1025:88: The SYCL queue is created ignoring the flag and priority options. */ - g_cudaStreams[id][is] = - dpct::get_current_device().create_queue()ï¼› - // CUDA_CHECK(DPCT_CHECK_ERROR( - // g_cudaStreams[id][is] = - // dpct::get_current_device().create_queue())); + CUDA_CHECK(DPCT_CHECK_ERROR( + g_cudaStreams[id][is] = + dpct::get_current_device().create_queue())); } // create cublas handle - g_cublas_handles[id] = &dpct::get_in_order_queue(); - // CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = - // &dpct::get_in_order_queue())); + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = + &dpct::get_in_order_queue())); /* - DPCT1027:107: The call to cublasSetMathMode was replaced with 0 - because this call is redundant in SYCL. + DPCT1027:89: The call to cublasSetMathMode was replaced with 0 + because this functionality is redundant in SYCL. */ CUBLAS_CHECK(0); } // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); - - ggml_cuda_set_device(user_device_number); fprintf(stderr, " set Device %d\n", user_device_number); - initialized = true; g_cublas_loaded = true; } @@ -8782,22 +9326,22 @@ void *ggml_cuda_host_malloc(size_t size) try { dpct::err0 err = DPCT_CHECK_ERROR( ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())); /* - DPCT1000:97: Error handling if-stmt was detected but could not be rewritten. + DPCT1000:91: Error handling if-stmt was detected but could not be rewritten. */ if (err != 0) { // clear the error /* - DPCT1026:98: The call to cudaGetLastError was removed because this call - is redundant in SYCL. + DPCT1026:92: The call to cudaGetLastError was removed because this + functionality is redundant in SYCL. */ /* - DPCT1001:96: The statement could not be removed. + DPCT1001:90: The statement could not be removed. */ fprintf( stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", /* - DPCT1009:99: SYCL uses exceptions to report errors and does not use + DPCT1009:93: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code. */ @@ -8839,7 +9383,8 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst, kind = dpct::device_to_device; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; - CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK(DPCT_CHECK_ERROR( + id = dpct::dev_mgr::instance().current_device_id())); src_ptr = (char *) extra->data_device[id]; } else { GGML_ASSERT(false); @@ -8871,10 +9416,10 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst, dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy( rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream)); /* - DPCT1001:100: The statement could not be removed. + DPCT1001:94: The statement could not be removed. */ /* - DPCT1000:101: Error handling if-stmt was detected but could not be + DPCT1000:95: Error handling if-stmt was detected but could not be rewritten. */ if (r != 0) return r; @@ -9256,8 +9801,8 @@ inline void ggml_cuda_op_mul_mat_q( const int64_t row_diff = row_high - row_low; int id; - id = dpct::dev_mgr::instance().current_device_id(); - // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into @@ -9520,8 +10065,8 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t row_diff = row_high - row_low; int id; - id = dpct::dev_mgr::instance().current_device_id(); - // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id()); + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into @@ -9561,7 +10106,7 @@ inline void ggml_cuda_op_mul_mat_cublas( CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm( - g_cublas_handles, oneapi::mkl::transpose::trans, + *g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, @@ -9588,8 +10133,10 @@ inline void ggml_cuda_op_mul_mat_cublas( CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm( *g_cublas_handles[id], oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha, - src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc))); + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00, + src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]), + dst_dd_i, ldc))); } (void) dst; @@ -9850,7 +10397,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1, scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); /* - DPCT1010:102: SYCL uses exceptions to report errors and does not use the + DPCT1010:96: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ CUDA_CHECK(0); @@ -9875,7 +10422,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1, clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); /* - DPCT1010:103: SYCL uses exceptions to report errors and does not use the + DPCT1010:97: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ CUDA_CHECK(0); @@ -9940,7 +10487,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0, // do the computation op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); /* - DPCT1010:104: SYCL uses exceptions to report errors and does not use the + DPCT1010:98: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ CUDA_CHECK(0); @@ -10131,7 +10678,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); /* - DPCT1010:105: SYCL uses exceptions to report errors and does not + DPCT1010:99: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ @@ -10152,7 +10699,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, if (split && used_devices > 1) { CUDA_CHECK(ggml_cuda_set_device(g_main_device)); /* - DPCT1024:106: The original code returned the error code that was further + DPCT1024:100: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code. */ @@ -10229,7 +10776,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); /* - DPCT1010:107: SYCL uses exceptions to report errors and does + DPCT1010:101: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ @@ -10244,7 +10791,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); /* - DPCT1010:108: SYCL uses exceptions to report errors and does not + DPCT1010:102: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ @@ -10289,7 +10836,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, // add event for the main device to wait on until other device is done if (split && (id != g_main_device || is != 0)) { /* - DPCT1024:109: The original code returned the error code that + DPCT1024:103: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code. @@ -10666,7 +11213,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0, // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( - g_cublas_handles, oneapi::mkl::transpose::trans, + *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *)src0_as_f16, dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half), @@ -10683,13 +11230,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0, sycl::range<3> block_dims(1, ne12, ne13); /* - DPCT1049:62: The work-group size passed to the SYCL kernel may exceed + DPCT1049:56: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ { dpct::has_capability_or_fail(main_stream->get_device(), {sycl::aspect::fp16}); + main_stream->submit([&](sycl::handler &cgh) { const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get(); const void **ptrs_src_get_ct3 = ptrs_src.get(); @@ -10707,14 +11255,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0, }); } /* - DPCT1010:110: SYCL uses exceptions to report errors and does not use the + DPCT1010:104: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. */ CUDA_CHECK(0); CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( - g_cublas_handles, oneapi::mkl::transpose::trans, + *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **)(ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), From 0c00b4f654268491f82221bc7beb46990f44cd67 Mon Sep 17 00:00:00 2001 From: jianyuzh Date: Fri, 29 Dec 2023 14:58:07 +0800 Subject: [PATCH 03/90] add debug functio, commit all help code --- README_sycl.md | 163 + dpcpp_out2/MainSourceFiles.yaml | 18145 +++++++++++++++++++++++++++ dpcpp_out2/ggml-alloc.h | 92 + dpcpp_out2/ggml-backend-impl.h | 116 + dpcpp_out2/ggml-backend.h | 188 + dpcpp_out2/ggml-cuda.dp.cpp | 12724 +++++++++++++++++++ dpcpp_out2/ggml-cuda.h | 64 + dpcpp_out2/ggml.h | 2253 ++++ dpcpp_out2/ggml.h.yaml | 100 + dpct/atomic.hpp | 842 ++ dpct/blas_utils.hpp | 1792 +++ dpct/ccl_utils.hpp | 286 + dpct/device.hpp | 781 ++ dpct/dnnl_utils.hpp | 4921 ++++++++ dpct/dpct.hpp | 62 + dpct/dpl_extras/algorithm.h | 2419 ++++ dpct/dpl_extras/dpcpp_extensions.h | 747 ++ dpct/dpl_extras/functional.h | 453 + dpct/dpl_extras/iterators.h | 347 + dpct/dpl_extras/memory.h | 1024 ++ dpct/dpl_extras/numeric.h | 32 + dpct/dpl_extras/vector.h | 752 ++ dpct/dpl_utils.hpp | 26 + dpct/fft_utils.hpp | 1376 ++ dpct/image.hpp | 901 ++ dpct/kernel.hpp | 459 + dpct/lapack_utils.hpp | 1953 +++ dpct/lib_common_utils.hpp | 174 + dpct/math.hpp | 1814 +++ dpct/memory.hpp | 1497 +++ dpct/rng_utils.hpp | 535 + dpct/sparse_utils.hpp | 1385 ++ dpct/util.hpp | 1070 ++ ggml-sycl.cpp | 1240 +- ggml-sycl.cpp.base | 11951 ++++++++++++++++++ migrate.sh | 18 + run.sh | 12 + setup.sh | 7 + 38 files changed, 71956 insertions(+), 765 deletions(-) create mode 100644 README_sycl.md create mode 100644 dpcpp_out2/MainSourceFiles.yaml create mode 100644 dpcpp_out2/ggml-alloc.h create mode 100644 dpcpp_out2/ggml-backend-impl.h create mode 100644 dpcpp_out2/ggml-backend.h create mode 100644 dpcpp_out2/ggml-cuda.dp.cpp create mode 100644 dpcpp_out2/ggml-cuda.h create mode 100644 dpcpp_out2/ggml.h create mode 100644 dpcpp_out2/ggml.h.yaml create mode 100644 dpct/atomic.hpp create mode 100644 dpct/blas_utils.hpp create mode 100644 dpct/ccl_utils.hpp create mode 100644 dpct/device.hpp create mode 100644 dpct/dnnl_utils.hpp create mode 100644 dpct/dpct.hpp create mode 100644 dpct/dpl_extras/algorithm.h create mode 100644 dpct/dpl_extras/dpcpp_extensions.h create mode 100644 dpct/dpl_extras/functional.h create mode 100644 dpct/dpl_extras/iterators.h create mode 100644 dpct/dpl_extras/memory.h create mode 100644 dpct/dpl_extras/numeric.h create mode 100644 dpct/dpl_extras/vector.h create mode 100644 dpct/dpl_utils.hpp create mode 100644 dpct/fft_utils.hpp create mode 100644 dpct/image.hpp create mode 100644 dpct/kernel.hpp create mode 100644 dpct/lapack_utils.hpp create mode 100644 dpct/lib_common_utils.hpp create mode 100644 dpct/math.hpp create mode 100644 dpct/memory.hpp create mode 100644 dpct/rng_utils.hpp create mode 100644 dpct/sparse_utils.hpp create mode 100644 dpct/util.hpp create mode 100644 ggml-sycl.cpp.base create mode 100755 migrate.sh create mode 100755 run.sh create mode 100755 setup.sh diff --git a/README_sycl.md b/README_sycl.md new file mode 100644 index 0000000000000..e76b9bbb4b85b --- /dev/null +++ b/README_sycl.md @@ -0,0 +1,163 @@ +# llama.cpp for SYCL + +## Background + +SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17. + +oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms. + +Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs. + +This project is migrated the CUDA code to SYCL to support Intel CPU, GPU and FPGA. + +But we focus on GPU performance tuning. If you want to run llama.cpp on Intel CPU, please use llama.cpp CPU release. + +## llama.cpp for SYCL + +We migrate the CUDA code SYCL. So the SYCL code replace the CUDA funcitions in llama.cpp, without function name change. + +That's why the code macro and log incudes CUBLAS flags. + +## OS + +### Linux + +In Linux, we reuse the CMAKE system of base. It's same as base llama.cpp. + +Except branch "windows", other branches are for Linux. + +### Windows + +In Windows, we change the C source files to meet the requirement of C++ compilers. + +So the code is saved in branch **windows** only. + +It will output 1 execute file: **llamap.cpp.sycl.exe**. + +If you want to get more binary files, please change the build prject. + + +## Linux + +### Setup Environment + +1. Install Intel oneAPI Base toolkit. + +2. Setup Local + +``` +./setup.sh +``` + +### Run + +#### Check device id + +Run without parameter: + +``` +./build/bin/main +``` + +Check the id in startup log, like: +ggml_init_cublas: found 6 CUDA devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0 + Device 4: Intel(R) UHD Graphics 770, compute capability 3.0 + Device 5: Intel(R) UHD Graphics 770, compute capability 1.3 + +#### Put model file to folder **models** + +#### Modify run.sh + +Up run.sh as above info: +``` +... +GGML_SYCL_DEVICE=0 +./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 +``` + +#### Run +``` +./run.sh +``` + + +## Windows + +### Setup Environment + +1. Install MS Visual Studio 2022. + +2. Install Intel oneAPI Base toolkit. + +a. Recommend to install all components and with **default path**. + +b. During installation, please choose option to enable compiler in MS Visual Studio. + +3. Code + +Swith to branch **windows**. + +Open **llama.cpp.sycl.sln** by Visual Studio 2022. + +4. Set oneAPI Path (optional) + +If you chagne the oneAPI installation target path, please modify the oneAPI path in the Visual Studio. + +Else, skip this step. + +### Build + +Build by visual Studio 2022 with x64 & Release. + +There will be execute file: **llama.cpp.sycl.exe**. + +It will take long time to build due to enable AOT on all hardware flatforms (CPU, GPU, FPGA) as default. + +To short it, change AOT target flatforms to one in Visual Studio 2022: **Specify SYCL offloading targets for AOT compilition**. + +#### Run + +#### Enable oneAPI Environment + +Run the command in command line or powershell. + +'C:\Program Files (x86)\Intel\oneAPI\setvars.bat' + +##### Check device id + + +Run without parameter: + +``` +.\x64\Release\llama.cpp.sycl.exe +``` + +Check the id in startup log, like: +ggml_init_cublas: found 6 CUDA devices: + Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3 + Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2 + Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0 + Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0 + Device 4: Intel(R) UHD Graphics 770, compute capability 3.0 + Device 5: Intel(R) UHD Graphics 770, compute capability 1.3 + +#### Put model file to folder **models** + +#### Modify run.sh + +Up run.sh as above info: +``` +... +set GGML_SYCL_DEVICE=0 + +.\x64\Release\llama.cpp.sycl.exe -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 +``` + +#### Run +``` +.\run.bat +``` \ No newline at end of file diff --git a/dpcpp_out2/MainSourceFiles.yaml b/dpcpp_out2/MainSourceFiles.yaml new file mode 100644 index 0000000000000..472f76ce1182e --- /dev/null +++ b/dpcpp_out2/MainSourceFiles.yaml @@ -0,0 +1,18145 @@ +--- +MainSourceFile: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/MainSrcFiles_placehold' +Replacements: + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 0 + Length: 0 + ReplacementText: "#define DPCT_PROFILING_ENABLED\n#define DPCT_COMPAT_RT_VERSION 12010\n#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211 + Length: 0 + ReplacementText: "\n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4081 + Length: 26 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4107 + Length: 18 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4125 + Length: 23 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4148 + Length: 23 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4176 + Length: 14 + ReplacementText: DPCT_COMPAT_RT_VERSION + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 4517 + Length: 0 + ReplacementText: "\n#include \n\n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 8376 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 8431 + Length: 14 + ReplacementText: DPCT_COMPAT_RT_VERSION + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 8506 + Length: 14 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 8528 + Length: 0 + ReplacementText: " /*\n DPCT1009:48: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 8543 + Length: 26 + ReplacementText: '"cublasGetStatusString is not supported"/*cublasGetStatusString(err)*/' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 9824 + Length: 0 + ReplacementText: "/*\nDPCT1001:50: The statement could not be removed.\n*/\n/*\nDPCT1000:51: Error handling if-stmt was detected but could not be rewritten.\n*/\n/*\nDPCT1009:52: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 9886 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 9961 + Length: 24 + ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err_)*/' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10064 + Length: 21 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10249 + Length: 8 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10291 + Length: 0 + ReplacementText: " /*\n DPCT1007:49: Migration of cuGetErrorString is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10350 + Length: 0 + ReplacementText: "/*\nDPCT1001:67: The statement could not be removed.\n*/\n/*\nDPCT1000:68: Error handling if-stmt was detected but could not be rewritten.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10412 + Length: 12 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10540 + Length: 14 + ReplacementText: DPCT_COMPAT_RT_VERSION + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10822 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10869 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 10880 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11159 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11170 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11451 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11462 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11646 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11657 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 11942 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 12027 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 12774 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 12985 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13253 + Length: 0 + ReplacementText: ' dpct_type_471834' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13260 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13542 + Length: 0 + ReplacementText: ' dpct_type_143705' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13549 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13854 + Length: 0 + ReplacementText: ' dpct_type_673649' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 13861 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14212 + Length: 0 + ReplacementText: ' dpct_type_135589' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14219 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14593 + Length: 0 + ReplacementText: ' dpct_type_122878' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14600 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14868 + Length: 0 + ReplacementText: ' dpct_type_143721' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 14875 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 15254 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 15390 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 15640 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 15770 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 16066 + Length: 0 + ReplacementText: ' dpct_type_619598' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 16183 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 16445 + Length: 0 + ReplacementText: ' dpct_type_138576' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 16713 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 17280 + Length: 0 + ReplacementText: ' dpct_type_154943' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 17287 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 18042 + Length: 0 + ReplacementText: ' dpct_type_866817' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 18049 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 18523 + Length: 0 + ReplacementText: ' dpct_type_107281' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 18669 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20185 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20254 + Length: 7 + ReplacementText: '&dpct::get_in_order_queue()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20397 + Length: 11 + ReplacementText: 'dpct::event_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20636 + Length: 11 + ReplacementText: 'dpct::err0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20687 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20728 + Length: 30 + ReplacementText: 'DPCT_CHECK_ERROR(current_device = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20813 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20833 + Length: 0 + ReplacementText: " /*\n DPCT1093:53: The \"device\" device may be not the one intended for use. Adjust the selected device if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20844 + Length: 13 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::select_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20865 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 20868 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21441 + Length: 14 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21530 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21555 + Length: 0 + ReplacementText: 'const sycl::stream &stream_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21563 + Length: 91 + ReplacementText: 'stream_ct1 << "ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21738 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21749 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21794 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21861 + Length: 0 + ReplacementText: " /*\n DPCT1096:98: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21874 + Length: 40 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21946 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21957 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21973 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 21996 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22004 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22079 + Length: 3 + ReplacementText: 'a.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22086 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22138 + Length: 3 + ReplacementText: 'a.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22145 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22219 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22230 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22275 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22342 + Length: 0 + ReplacementText: " /*\n DPCT1096:97: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22354 + Length: 50 + ReplacementText: 'sycl::fmax(x, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22436 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22447 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22535 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22546 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22635 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22646 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22735 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22746 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 22937 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23205 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23229 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23240 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23253 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23286 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23297 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23310 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23344 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23355 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23368 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23408 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23419 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23432 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23935 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 23946 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 24206 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 24482 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 24505 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 24516 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 24529 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25255 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25436 + Length: 0 + ReplacementText: ', const sycl::nd_item<3> &item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25458 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25471 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25484 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25849 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 25915 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26051 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26062 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26075 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26180 + Length: 51 + ReplacementText: 'sycl::tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26244 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26310 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26332 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26343 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26356 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26438 + Length: 11 + ReplacementText: 'sycl::native::exp(-x[i])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26462 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26526 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26592 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26603 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26616 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26705 + Length: 28 + ReplacementText: 'sycl::native::exp(GELU_QUICK_COEF * x[i])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26747 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26805 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26828 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26839 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26852 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26918 + Length: 11 + ReplacementText: 'sycl::tanh((float)(x[i]))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 26941 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27007 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27029 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27040 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27053 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27120 + Length: 14 + ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27146 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27244 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27267 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27278 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27291 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27357 + Length: 14 + ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27374 + Length: 17 + ReplacementText: 'sycl::fmin((float)(x[i]), 0.0f)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27420 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27485 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27507 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27518 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27531 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27647 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27734 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27758 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27769 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27782 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27815 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27833 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27851 + Length: 21 + ReplacementText: 'sycl::float2(0.f, 0.f)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 27986 + Length: 10 + ReplacementText: 'mean_var.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28012 + Length: 10 + ReplacementText: 'mean_var.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28108 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28153 + Length: 28 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28204 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28251 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28353 + Length: 0 + ReplacementText: " /*\n DPCT1118:0: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28361 + Length: 15 + ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28456 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28489 + Length: 10 + ReplacementText: 'mean_var.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28531 + Length: 10 + ReplacementText: 'mean_var.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28591 + Length: 17 + ReplacementText: 'sycl::rsqrt(var + eps)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28755 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28856 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28875 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28889 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 28902 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29020 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29047 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29066 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29085 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29170 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29201 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29220 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29345 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29377 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29405 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29477 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29588 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29642 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29656 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29669 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29792 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29883 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29946 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29973 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 29992 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30050 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30164 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30183 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30197 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30210 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30329 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30356 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30375 + Length: 9 + ReplacementText: 'item_ct1.get_group_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30409 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30430 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30507 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30539 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30697 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30818 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, float *s_sum" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30838 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 30911 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31155 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31200 + Length: 27 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31250 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31297 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31394 + Length: 0 + ReplacementText: " /*\n DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31402 + Length: 15 + ReplacementText: "/*\n DPCT1065:54: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31482 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31709 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31754 + Length: 27 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31804 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31851 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31948 + Length: 0 + ReplacementText: " /*\n DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 31956 + Length: 15 + ReplacementText: "/*\n DPCT1065:55: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32036 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32103 + Length: 22 + ReplacementText: 'sycl::rsqrt(variance + eps)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32246 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32337 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, float *s_sum" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32361 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32372 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32385 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32418 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32679 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32724 + Length: 27 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32774 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32821 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32918 + Length: 0 + ReplacementText: " /*\n DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 32926 + Length: 15 + ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33006 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33076 + Length: 18 + ReplacementText: 'sycl::rsqrt(mean + eps)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33230 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33241 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33462 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33483 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33593 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33600 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33621 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33628 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33679 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33690 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33861 + Length: 20 + ReplacementText: 'x[ib].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33904 + Length: 21 + ReplacementText: 'x[ib].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33968 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 33989 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34093 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34100 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34118 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34125 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34173 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34184 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34537 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34579 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34713 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34720 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34742 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34749 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34801 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34812 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 34983 + Length: 20 + ReplacementText: 'x[ib].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35026 + Length: 21 + ReplacementText: 'x[ib].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35222 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35264 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35390 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35397 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35415 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35422 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35470 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35481 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35666 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35695 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35780 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35794 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 35910 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 36001 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 36026 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 36111 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 36329 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 36367 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 37268 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 37359 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 37382 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 37481 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 37577 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39257 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39566 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39657 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39732 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39806 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 39991 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 40035 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 40869 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 40960 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 41035 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 41159 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 41386 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 41430 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 42569 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 42660 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 42735 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 42859 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44069 + Length: 0 + ReplacementText: "/*\nDPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44076 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44233 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44360 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44371 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44384 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44669 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 44746 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 45550 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 45598 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 47936 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 47995 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48048 + Length: 0 + ReplacementText: "/*\nDPCT1110:5: The total declared local variable size in device function dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48055 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48212 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48237 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48248 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48261 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48620 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 48697 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52351 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52410 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52463 + Length: 0 + ReplacementText: "/*\nDPCT1110:6: The total declared local variable size in device function dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52470 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52627 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52652 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52663 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 52676 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 53015 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 53092 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54147 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54195 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54792 + Length: 6 + ReplacementText: 'sycl::float4' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54899 + Length: 3 + ReplacementText: 's.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54923 + Length: 3 + ReplacementText: 's.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54963 + Length: 3 + ReplacementText: 's.z()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 54987 + Length: 3 + ReplacementText: 's.w()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 55137 + Length: 3 + ReplacementText: 's.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 55151 + Length: 3 + ReplacementText: 's.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 55176 + Length: 3 + ReplacementText: 's.z()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 55190 + Length: 3 + ReplacementText: 's.w()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57141 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57245 + Length: 0 + ReplacementText: "/*\nDPCT1110:7: The total declared local variable size in device function dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57252 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57398 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57423 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57786 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 57832 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 58662 + Length: 19 + ReplacementText: 'x[i].dm[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 58710 + Length: 20 + ReplacementText: 'x[i].dm[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 59016 + Length: 6 + ReplacementText: 'sycl::float4' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 59509 + Length: 5 + ReplacementText: 'sum.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 59666 + Length: 5 + ReplacementText: 'sum.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 59823 + Length: 5 + ReplacementText: 'sum.z()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 59980 + Length: 5 + ReplacementText: 'sum.w()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 60317 + Length: 5 + ReplacementText: 'sum.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 60333 + Length: 5 + ReplacementText: 'sum.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 60349 + Length: 5 + ReplacementText: 'sum.z()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 60365 + Length: 5 + ReplacementText: 'sum.w()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61535 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61594 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61654 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61811 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61938 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61949 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 61962 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 62194 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 62271 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 65877 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 65988 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66086 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66104 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66181 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66208 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66241 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66436 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66463 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66496 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66613 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66636 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66647 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66660 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66743 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66754 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 66767 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67043 + Length: 9 + ReplacementText: 'sycl::fabs((float)xi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67153 + Length: 56 + ReplacementText: 'sycl::fmax(amax, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), amax, mask))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67226 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67349 + Length: 14 + ReplacementText: 'sycl::round(xi / d)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67453 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67460 + Length: 10 + ReplacementText: 'y[ib].ds.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67498 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67505 + Length: 10 + ReplacementText: 'y[ib].ds.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 67614 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68030 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68072 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68083 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68096 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68132 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68143 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68156 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68190 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68201 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68214 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68254 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68265 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68278 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68848 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68890 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 68947 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69371 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69412 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69423 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69436 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69469 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69480 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69493 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69527 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69538 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69551 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69591 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69602 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 69615 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70018 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70116 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70138 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70149 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70164 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70509 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70545 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70789 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70800 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70904 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 70924 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71239 + Length: 27 + ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71283 + Length: 27 + ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71329 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71343 + Length: 19 + ReplacementText: 'ds8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71454 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71479 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71637 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71648 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71734 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71753 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 71773 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72088 + Length: 27 + ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72132 + Length: 27 + ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72323 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72337 + Length: 19 + ReplacementText: 'dm4.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72368 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72382 + Length: 19 + ReplacementText: 'ds8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72426 + Length: 6 + ReplacementText: 'dm4f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72435 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72466 + Length: 6 + ReplacementText: 'dm4f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72475 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72814 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72825 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72946 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 72966 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 73452 + Length: 27 + ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 73861 + Length: 27 + ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 73947 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 73961 + Length: 19 + ReplacementText: 'ds8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74073 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74099 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74257 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74268 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74371 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74390 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74410 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 74896 + Length: 27 + ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75305 + Length: 27 + ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75536 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75550 + Length: 19 + ReplacementText: 'dm5.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75581 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75595 + Length: 19 + ReplacementText: 'ds8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75639 + Length: 6 + ReplacementText: 'dm5f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75648 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75679 + Length: 6 + ReplacementText: 'dm5f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 75688 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76007 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76018 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76145 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76359 + Length: 24 + ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76511 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76522 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76608 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76627 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76647 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 76861 + Length: 24 + ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77049 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77063 + Length: 19 + ReplacementText: 'dm8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77094 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77108 + Length: 19 + ReplacementText: 'ds8.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77152 + Length: 6 + ReplacementText: 'dm8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77161 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77192 + Length: 6 + ReplacementText: 'dm8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77201 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77525 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77536 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77681 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 77732 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78032 + Length: 19 + ReplacementText: 'dpct::dp4a(vi, u[i], 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78213 + Length: 18 + ReplacementText: 'dpct::dp4a(m, u[i], 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78305 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78319 + Length: 19 + ReplacementText: 'dm2.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78352 + Length: 6 + ReplacementText: 'dm2f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78368 + Length: 6 + ReplacementText: 'dm2f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78479 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78490 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78647 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 78685 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79137 + Length: 29 + ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_d_sc)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79212 + Length: 26 + ReplacementText: 'dpct::dp4a(m, u[i], sumi_m)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79348 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79362 + Length: 19 + ReplacementText: 'dm2.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79401 + Length: 6 + ReplacementText: 'dm2f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79417 + Length: 6 + ReplacementText: 'dm2f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79588 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79599 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 79837 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 80584 + Length: 19 + ReplacementText: 'dpct::vectorized_binary(vil, vih, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 80631 + Length: 19 + ReplacementText: 'dpct::dp4a(vi, u[i], 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 80803 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 80814 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81007 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81288 + Length: 27 + ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_sc)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81585 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81596 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81782 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 81833 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82152 + Length: 47 + ReplacementText: 'dpct::dp4a(v1i, u[2*i+1], dpct::dp4a(v0i, u[2*i+0], 0))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82246 + Length: 61 + ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+1], dpct::dp4a(0x01010101, u[2*i+0], 0))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82481 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82495 + Length: 19 + ReplacementText: 'dm4.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82528 + Length: 6 + ReplacementText: 'dm4f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82544 + Length: 6 + ReplacementText: 'dm4f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82656 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82667 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82852 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82871 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 82904 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83216 + Length: 60 + ReplacementText: 'dpct::dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83323 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83337 + Length: 22 + ReplacementText: 'ds8[i].convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83380 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83425 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83495 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83509 + Length: 19 + ReplacementText: 'dm4.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83542 + Length: 6 + ReplacementText: 'dm4f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83558 + Length: 6 + ReplacementText: 'dm4f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83729 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83740 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 83956 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84007 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84524 + Length: 47 + ReplacementText: 'dpct::dp4a(v0i, u[2*i+0], dpct::dp4a(v1i, u[2*i+1], 0))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84618 + Length: 61 + ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+0], dpct::dp4a(0x01010101, u[2*i+1], 0))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84795 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84809 + Length: 19 + ReplacementText: 'dm5.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84842 + Length: 6 + ReplacementText: 'dm5f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84858 + Length: 6 + ReplacementText: 'dm5f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84970 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 84981 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85166 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85185 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85218 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85530 + Length: 46 + ReplacementText: 'dpct::dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85623 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85637 + Length: 22 + ReplacementText: 'ds8[i].convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85680 + Length: 6 + ReplacementText: 'ds8f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85725 + Length: 6 + ReplacementText: 'ds8f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85795 + Length: 6 + ReplacementText: 'sycl::float2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85809 + Length: 19 + ReplacementText: 'dm4.convert()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85842 + Length: 6 + ReplacementText: 'dm4f.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 85858 + Length: 6 + ReplacementText: 'dm4f.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86029 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86040 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86250 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86581 + Length: 34 + ReplacementText: 'dpct::vectorized_binary((vil | vih), 0x20202020, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86668 + Length: 19 + ReplacementText: 'dpct::dp4a(vi, u[i], 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86837 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 86848 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87050 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87236 + Length: 4 + ReplacementText: 'sycl::int2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87362 + Length: 8 + ReplacementText: 'sumi_d.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87373 + Length: 36 + ReplacementText: 'dpct::dp4a(v[2*i+0], u[2*i+0], sumi_d.x())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87443 + Length: 8 + ReplacementText: 'sumi_d.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87454 + Length: 36 + ReplacementText: 'dpct::dp4a(v[2*i+1], u[2*i+1], sumi_d.x())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87525 + Length: 8 + ReplacementText: 'sumi_d.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87536 + Length: 36 + ReplacementText: 'dpct::dp4a(v[2*i+4], u[2*i+4], sumi_d.y())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87606 + Length: 8 + ReplacementText: 'sumi_d.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87617 + Length: 36 + ReplacementText: 'dpct::dp4a(v[2*i+5], u[2*i+5], sumi_d.y())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87727 + Length: 8 + ReplacementText: 'sumi_d.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87749 + Length: 8 + ReplacementText: 'sumi_d.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87863 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 87874 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88517 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88528 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88582 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88621 + Length: 0 + ReplacementText: ",\n int *tile_x_qs_q4_0,\n float *tile_x_d_q4_0" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88658 + Length: 66 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88729 + Length: 72 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88844 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88928 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 88939 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 89036 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 89674 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 90202 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 90375 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 90386 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 90476 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 90606 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 91301 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 91312 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 91963 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 91974 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92028 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92067 + Length: 0 + ReplacementText: ",\n int *tile_x_qs_q4_1,\n sycl::half2 *tile_x_dm_q4_1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92104 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92176 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92367 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92378 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 92475 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93077 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93545 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93718 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93729 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93819 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 93949 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 94595 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 94606 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95360 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95371 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95425 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95464 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q5_0,\n float *tile_x_d_q5_0" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95501 + Length: 66 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95572 + Length: 72 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95687 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95771 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95782 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 95879 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 96481 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 96997 + Length: 26 + ReplacementText: 'dpct::vectorized_binary(qs0, 0x10101010, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 97388 + Length: 26 + ReplacementText: 'dpct::vectorized_binary(qs1, 0x10101010, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 97814 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 97987 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 97998 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 98088 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 98218 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 98999 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99010 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99779 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99790 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99844 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99883 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q5_1,\n sycl::half2 *tile_x_dm_q5_1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99920 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 99992 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 100183 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 100194 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 100291 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 100892 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 102055 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 102228 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 102239 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 102329 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 102459 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103145 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103156 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103671 + Length: 21 + ReplacementText: 'bq8_1->ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103726 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103737 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103791 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103830 + Length: 0 + ReplacementText: ",\n int *tile_x_qs_q8_0,\n float *tile_x_d_q8_0" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103867 + Length: 66 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 103938 + Length: 72 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 104053 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 104137 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 104148 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 104245 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 104883 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 105342 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 105515 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 105526 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 105616 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 105746 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 106175 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 106186 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 106823 + Length: 36 + ReplacementText: 'bq8_1[bq8_offset + i].ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 106968 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 106979 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107033 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107072 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q2_K,\n sycl::half2 *tile_x_dm_q2_K,\n int *tile_x_sc_q2_K" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107097 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107169 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107247 + Length: 69 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107462 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107473 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 107570 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 108160 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 108638 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 108968 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 109220 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 109231 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 109321 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 109451 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 110293 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 110304 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111071 + Length: 36 + ReplacementText: 'bq8_1[bq8_offset + i].ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111234 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111245 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111299 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111338 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q3_K,\n sycl::half2 *tile_x_dm_q3_K,\n int *tile_x_qh_q3_K,\n int *tile_x_sc_q3_K" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111347 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111419 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111497 + Length: 69 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111571 + Length: 69 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111814 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111825 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 111922 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 112496 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 113002 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 113332 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 113813 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 114376 + Length: 39 + ReplacementText: 'dpct::vectorized_binary(sc_low | sc_high, 0x20202020, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 114500 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 114511 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 114601 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 114731 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 115560 + Length: 19 + ReplacementText: 'dpct::vectorized_binary(vll, vlh, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 115794 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 115805 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 117199 + Length: 20 + ReplacementText: 'bq8i->ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 118931 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 118942 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 118996 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119035 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q4_K,\n sycl::half2 *tile_x_dm_q4_K,\n int *tile_x_sc_q4_K" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119060 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119132 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119210 + Length: 69 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119425 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119436 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 119533 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 120169 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 120702 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 121154 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 121766 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 121777 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 121867 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 121997 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 122461 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 122472 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 123675 + Length: 21 + ReplacementText: 'bq8i->ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125442 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125453 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125507 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125546 + Length: 0 + ReplacementText: ",\n int *tile_x_ql_q5_K,\n sycl::half2 *tile_x_dm_q5_K,\n int *tile_x_sc_q5_K" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125571 + Length: 67 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125643 + Length: 73 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125721 + Length: 69 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125936 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 125947 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 126044 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 126680 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 127820 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 128183 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 128795 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 128806 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 128896 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 129026 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 129550 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 129561 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130421 + Length: 38 + ReplacementText: 'bq8_1[bq8_offset + 2*i].ds[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130572 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130583 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130637 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130676 + Length: 0 + ReplacementText: ",\n int *tile_x_ql,\n sycl::half2 *tile_x_dm,\n int *tile_x_sc" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130701 + Length: 62 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130768 + Length: 68 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 130841 + Length: 64 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 131036 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 131047 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 131144 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 131780 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 132523 + Length: 32 + ReplacementText: 'dpct::vectorized_binary(ql0 | qh0, 0x20202020, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 132601 + Length: 32 + ReplacementText: 'dpct::vectorized_binary(ql1 | qh1, 0x20202020, dpct::sub_sat())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133031 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133371 + Length: 13 + ReplacementText: 'sycl::min(i, i_max)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133606 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133617 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133707 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 133837 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 134572 + Length: 0 + ReplacementText: "/*\nDPCT1110:8: The total declared local variable size in device function mul_mat_q exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 134579 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 134590 + Length: 15 + ReplacementText: __dpct_inline__ + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 134837 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 134892 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135212 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135294 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135354 + Length: 47 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135406 + Length: 53 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135710 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135742 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 135875 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136038 + Length: 41 + ReplacementText: 'dpct::min((unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), ncols_y-1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136263 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136383 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136546 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136568 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136642 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136713 + Length: 29 + ReplacementText: 'sycl::min(col_y_0 + ids, ncols_y-1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136867 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 136984 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137237 + Length: 20 + ReplacementText: '(*dsi_src)[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137292 + Length: 0 + ReplacementText: " /*\n DPCT1118:9: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137304 + Length: 15 + ReplacementText: "/*\n DPCT1065:56: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137814 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137831 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137907 + Length: 0 + ReplacementText: " /*\n DPCT1118:10: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 137919 + Length: 15 + ReplacementText: "/*\n DPCT1065:57: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 138058 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 138251 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 138973 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 139462 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,\n int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 139503 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 140376 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 140608 + Length: 0 + ReplacementText: ', tile_x_qs_q4_0, tile_x_d_q4_0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 140895 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 140905 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 142059 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 142278 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 142600 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,\n sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 142641 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 143513 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 143745 + Length: 0 + ReplacementText: ', tile_x_qs_q4_1, tile_x_dm_q4_1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 144032 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 144042 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 145195 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 145684 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,\n int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 145725 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 146597 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 146829 + Length: 0 + ReplacementText: ', tile_x_ql_q5_0, tile_x_d_q5_0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 147117 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 147127 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 148281 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 148766 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,\n sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 148807 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 149678 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 149910 + Length: 0 + ReplacementText: ', tile_x_ql_q5_1, tile_x_dm_q5_1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 150197 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 150207 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 151360 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 151849 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,\n int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 151890 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 152762 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 152994 + Length: 0 + ReplacementText: ', tile_x_qs_q8_0, tile_x_d_q8_0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 153282 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 153292 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 154447 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 154932 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,\n sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,\n sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 154973 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 155844 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 156076 + Length: 0 + ReplacementText: ', tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 156363 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 156373 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 157528 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 157747 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 158069 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,\n sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,\n int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 158110 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 158981 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 159213 + Length: 0 + ReplacementText: ', tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, tile_x_sc_q3_K' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 159500 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 159510 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 160663 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 160882 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 161204 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,\n sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,\n sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 161245 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 162115 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 162347 + Length: 0 + ReplacementText: ', tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 162633 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 162643 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 163795 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 164280 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,\n sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,\n sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 164321 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 165191 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 165423 + Length: 0 + ReplacementText: ', tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 165709 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 165719 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 166870 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 167089 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 167411 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,\n int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 167452 + Length: 5 + ReplacementText: 'sycl::half2' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 168323 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 168555 + Length: 0 + ReplacementText: ', tile_x_ql, tile_x_dm, tile_x_sc' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 168842 + Length: 0 + ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 168852 + Length: 13 + ReplacementText: DPCT_COMPATIBILITY_TEMP + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 169571 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 169719 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 169743 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 169754 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 169767 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170198 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170269 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170378 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170646 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170705 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170830 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 170988 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 171120 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 171131 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 171144 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 171225 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 172594 + Length: 3 + ReplacementText: 'v.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 172645 + Length: 3 + ReplacementText: 'v.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 172852 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173048 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173262 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173277 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173295 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173330 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173341 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173354 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173391 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173402 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173415 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173670 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173719 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 173938 + Length: 19 + ReplacementText: 'sycl::vec(x[ix]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174346 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174405 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174466 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174741 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174756 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174774 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174813 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174824 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174837 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174876 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174887 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 174900 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175200 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175249 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175513 + Length: 19 + ReplacementText: 'sycl::vec(x[ix]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175696 + Length: 42 + ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175755 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175816 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 175988 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176100 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176115 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176143 + Length: 17 + ReplacementText: 'sycl::vec(*xi).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176172 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176246 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176265 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176282 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176297 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176370 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176670 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176692 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176703 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 176716 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 177406 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 177689 + Length: 21 + ReplacementText: 'sycl::fmax(amax, sycl::fabs((float)v))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 177917 + Length: 10 + ReplacementText: 'sycl::round((float)x0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 177945 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 178239 + Length: 8 + ReplacementText: 'sycl::fabs((float)v)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 178270 + Length: 8 + ReplacementText: 'sycl::fabs((float)v)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 178569 + Length: 28 + ReplacementText: 'dpct::min(15, (int8_t)(x0 + 8.5f))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 178627 + Length: 28 + ReplacementText: 'dpct::min(15, (int8_t)(x1 + 8.5f))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 178735 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179187 + Length: 10 + ReplacementText: 'dsti->dm.x()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179207 + Length: 10 + ReplacementText: 'dsti->dm.y()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179404 + Length: 28 + ReplacementText: 'dpct::min(15, (int8_t)(x0 + 0.5f))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179462 + Length: 28 + ReplacementText: 'dpct::min(15, (int8_t)(x1 + 0.5f))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179611 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179905 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179928 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179939 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 179952 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 180465 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 180585 + Length: 23 + ReplacementText: 'sycl::max(0.001f, high - low)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 180628 + Length: 23 + ReplacementText: 'sycl::min(1.0f, sycl::max(0.0f, y))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 180878 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181499 + Length: 23 + ReplacementText: 'sycl::log(1.0f / freq_scale)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181547 + Length: 11 + ReplacementText: 'sycl::cos(theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181586 + Length: 11 + ReplacementText: 'sycl::sin(theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181700 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181899 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181926 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181937 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 181950 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182032 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182043 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182056 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182214 + Length: 34 + ReplacementText: 'dpct::pow(freq_base, -float(col)/ncols)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182588 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182823 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182850 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182861 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182874 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182956 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182967 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 182980 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183427 + Length: 27 + ReplacementText: 'dpct::pow(theta_scale, col/2.0f)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183784 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183947 + Length: 0 + ReplacementText: ', const sycl::nd_item<3> &item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183971 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183982 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 183995 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184119 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184130 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184143 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184263 + Length: 32 + ReplacementText: 'dpct::pow(freq_base, -2.0f*col/ncols)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184406 + Length: 17 + ReplacementText: 'sycl::min(p, n_ctx - 2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184480 + Length: 11 + ReplacementText: 'sycl::sin((float)theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184521 + Length: 11 + ReplacementText: 'sycl::cos((float)theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184759 + Length: 21 + ReplacementText: 'sycl::max(p - n_ctx - 2, 0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184833 + Length: 17 + ReplacementText: 'sycl::sin((float)block_theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 184886 + Length: 17 + ReplacementText: 'sycl::cos((float)block_theta)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185151 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185335 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185359 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185370 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185383 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185464 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185475 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185488 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185631 + Length: 15 + ReplacementText: 'dpct::pow(m0, k + 1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185675 + Length: 42 + ReplacementText: 'dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185767 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185843 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185867 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185899 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 185973 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186058 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186150 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186268 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186345 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186383 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186410 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 186624 + Length: 15 + ReplacementText: "/*\n DPCT1065:58: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187320 + Length: 0 + ReplacementText: " /*\n DPCT1118:11: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187332 + Length: 15 + ReplacementText: "/*\n DPCT1065:59: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187375 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187500 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187524 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187535 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187548 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187581 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187592 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187605 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 187993 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188122 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1, float *buf" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188147 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188181 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188305 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188342 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188391 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188421 + Length: 57 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188670 + Length: 46 + ReplacementText: 'sycl::max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188801 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188914 + Length: 0 + ReplacementText: " /*\n DPCT1118:12: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 188922 + Length: 15 + ReplacementText: "/*\n DPCT1065:60: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189014 + Length: 0 + ReplacementText: " /*\n DPCT1118:13: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189022 + Length: 15 + ReplacementText: "/*\n DPCT1065:61: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189113 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189311 + Length: 50 + ReplacementText: 'sycl::native::exp((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189483 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189590 + Length: 0 + ReplacementText: " /*\n DPCT1118:14: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189598 + Length: 15 + ReplacementText: "/*\n DPCT1065:62: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189686 + Length: 0 + ReplacementText: " /*\n DPCT1118:15: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189694 + Length: 15 + ReplacementText: "/*\n DPCT1065:63: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n */\n item_ct1.barrier()" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189773 + Length: 0 + ReplacementText: ', item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 189962 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190048 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190070 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190081 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190094 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190186 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190287 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190309 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190320 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190333 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190458 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190515 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190671 + Length: 0 + ReplacementText: ",\n const sycl::nd_item<3> &item_ct1" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190693 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190707 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 190720 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191022 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191095 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191134 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191257 + Length: 18 + ReplacementText: 'sycl::vec(0.0f).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191325 + Length: 10 + ReplacementText: 'item_ct1.get_group(0)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191378 + Length: 44 + ReplacementText: 'sycl::vec(x[offset_src + iih * IW + iiw]).convert()[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191670 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191738 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191754 + Length: 30 + ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191897 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 191913 + Length: 28 + ReplacementText: 'ne11*ne12, ne10, block_num_x' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 192443 + Length: 294 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 192737 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 192985 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 193053 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 193069 + Length: 30 + ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 193206 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 193222 + Length: 28 + ReplacementText: 'ne11*ne12, ne10, block_num_x' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 193719 + Length: 288 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 194007 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 194368 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196599 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196614 + Length: 0 + ReplacementText: '(1, 1, 1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196628 + Length: 12 + ReplacementText: 'block_dims[2]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196697 + Length: 12 + ReplacementText: 'block_dims[1]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196712 + Length: 54 + ReplacementText: 'std::min(ne1, block_size / (unsigned int)block_dims[2])' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196780 + Length: 12 + ReplacementText: 'block_dims[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196795 + Length: 88 + ReplacementText: 'std::min(std::min(ne2*ne3, block_size / (unsigned int)block_dims[2] / (unsigned int)block_dims[1]), 64U)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196898 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 196914 + Length: 188 + ReplacementText: '(ne2*ne3 + block_dims[0] - 1) / block_dims[0], (ne1 + block_dims[1] - 1) / block_dims[1], (hne0 + block_dims[2] - 1) / block_dims[2]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197122 + Length: 12 + ReplacementText: 'block_nums[0]' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197342 + Length: 284 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), sycl::range<3>(1, 1, block_size)), \n [=](sycl::nd_item<3> item_ct1) {\n k_bin_bcast_unravel(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197626 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197649 + Length: 0 + ReplacementText: " /*\n DPCT1049:16: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197665 + Length: 277 + ReplacementText: " dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 197942 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198176 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198286 + Length: 114 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198400 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198474 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198583 + Length: 68 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n gelu_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198651 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198725 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198834 + Length: 68 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n silu_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198902 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 198982 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199091 + Length: 74 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n gelu_quick_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199165 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199239 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199348 + Length: 68 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n tanh_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199416 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199490 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199599 + Length: 68 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n relu_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199667 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199775 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199884 + Length: 90 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n leaky_relu_f32(x, dst, k, negative_slope, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 199974 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200047 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200154 + Length: 66 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n sqr_f32(x, dst, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200220 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200332 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200434 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200450 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200476 + Length: 73 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n norm_f32(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200549 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200578 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200594 + Length: 10 + ReplacementText: 1, 1, 1024 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200607 + Length: 0 + ReplacementText: " /*\n DPCT1049:17: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200615 + Length: 68 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200683 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200823 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200925 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200941 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 200967 + Length: 102 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n const float eps_ct4 = eps;\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201069 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201098 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201114 + Length: 10 + ReplacementText: 1, 1, 1024 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201127 + Length: 0 + ReplacementText: " /*\n DPCT1049:18: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201135 + Length: 97 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n const float eps_ct4 = eps;\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n group_norm_f32<1024>(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201232 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201361 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201470 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201483 + Length: 20 + ReplacementText: ne2, ne1, num_blocks + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201510 + Length: 80 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n concat_f32(x, y, dst, ne0, ne02, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201590 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201726 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201874 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201887 + Length: 39 + ReplacementText: 'ne02, (ne01 * scale_factor), num_blocks' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 201933 + Length: 101 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202034 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202195 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202298 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202311 + Length: 20 + ReplacementText: ne2, ne1, num_blocks + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202338 + Length: 83 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202421 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202537 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202639 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202655 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202681 + Length: 77 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n rms_norm_f32(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202758 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202787 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202803 + Length: 10 + ReplacementText: 1, 1, 1024 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202816 + Length: 0 + ReplacementText: " /*\n DPCT1049:19: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202824 + Length: 72 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 202896 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203019 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203151 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203167 + Length: 18 + ReplacementText: 1, ky, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203198 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203214 + Length: 32 + ReplacementText: 1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203253 + Length: 74 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(num_blocks * block_size, block_size), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n quantize_q8_1(x, vy, kx, kx_padded, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203327 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203514 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203635 + Length: 108 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block(vx, y, k, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203743 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203851 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203923 + Length: 51 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block_q2_K(vx, y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 203974 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204152 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204224 + Length: 51 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block_q3_K(vx, y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204275 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204453 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204509 + Length: 51 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block_q4_K(vx, y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204560 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204668 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204740 + Length: 51 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block_q5_K(vx, y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204791 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 204969 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 205041 + Length: 51 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n [=](sycl::nd_item<3> item_ct1) {\n dequantize_block_q6_K(vx, y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 205092 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207504 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207771 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207787 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207817 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207833 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207869 + Length: 127 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 207996 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208128 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208286 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208302 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208332 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208348 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208384 + Length: 127 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208511 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208643 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208801 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208817 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208847 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208863 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 208899 + Length: 127 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209026 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209158 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209316 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209332 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209362 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209378 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209414 + Length: 127 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209541 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209673 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209831 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209847 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209877 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209893 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 209929 + Length: 127 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210056 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210187 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210397 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210413 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210443 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210459 + Length: 9 + ReplacementText: 1, ny, 32 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210475 + Length: 92 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210567 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210698 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210865 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210881 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210911 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210927 + Length: 9 + ReplacementText: 1, ny, 32 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 210943 + Length: 92 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211035 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211166 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211333 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211349 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211379 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211395 + Length: 9 + ReplacementText: 1, ny, 32 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211411 + Length: 92 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211503 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211634 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211703 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211719 + Length: 8 + ReplacementText: 1, 1, 32 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211734 + Length: 80 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211814 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 211945 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212112 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212128 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212158 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212174 + Length: 9 + ReplacementText: 1, ny, 32 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212190 + Length: 92 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212282 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212410 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212568 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212584 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212614 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212630 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212666 + Length: 115 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212781 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 212906 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213053 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213069 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213099 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213115 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213151 + Length: 153 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213304 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213429 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213576 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213592 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213622 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213638 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213674 + Length: 153 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213827 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 213952 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214099 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214115 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214145 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214161 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214197 + Length: 153 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214350 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214475 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214622 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214638 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214668 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214684 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214720 + Length: 153 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214873 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 214998 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215145 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215161 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215191 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215207 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215243 + Length: 153 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215396 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215521 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215667 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215683 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215713 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215729 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215765 + Length: 152 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 215917 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216042 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216188 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216204 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216234 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216250 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216286 + Length: 152 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216438 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216563 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216709 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216725 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216755 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216771 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216807 + Length: 152 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 216959 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217084 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217230 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217246 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217276 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217292 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217328 + Length: 152 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217480 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217605 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217751 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217767 + Length: 17 + ReplacementText: 1, 1, block_num_y + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217797 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217813 + Length: 29 + ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 217849 + Length: 152 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_q(vx, vy, dst, ncols, nrows, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 218001 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 218199 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 218220 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 218250 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219170 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219186 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219226 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219242 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219337 + Length: 0 + ReplacementText: " /*\n DPCT1049:20: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219345 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219481 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219534 + Length: 0 + ReplacementText: " /*\n DPCT1049:21: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219542 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219678 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219687 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219882 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219903 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 219933 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 220853 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 220869 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 220909 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 220925 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221020 + Length: 0 + ReplacementText: " /*\n DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221028 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221164 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221217 + Length: 0 + ReplacementText: " /*\n DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221225 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_1(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221361 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221370 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221565 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221586 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 221616 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222536 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222552 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222592 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222608 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222703 + Length: 0 + ReplacementText: " /*\n DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222711 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222847 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222900 + Length: 0 + ReplacementText: " /*\n DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 222908 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 223044 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 223053 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 223248 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 223269 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 223299 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224219 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224235 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224275 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224291 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224386 + Length: 0 + ReplacementText: " /*\n DPCT1049:26: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224394 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224530 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224583 + Length: 0 + ReplacementText: " /*\n DPCT1049:27: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224591 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_1(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224727 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224736 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224931 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224952 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 224982 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 225902 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 225918 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 225958 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 225974 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226069 + Length: 0 + ReplacementText: " /*\n DPCT1049:28: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226077 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226213 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226266 + Length: 0 + ReplacementText: " /*\n DPCT1049:29: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226274 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q8_0(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226410 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226419 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226614 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226635 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 226665 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227585 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227601 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227641 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227657 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227752 + Length: 0 + ReplacementText: " /*\n DPCT1049:30: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227760 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n sycl::local_accessor tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4) + mmq_y/4), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227896 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227949 + Length: 0 + ReplacementText: " /*\n DPCT1049:31: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 227957 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n sycl::local_accessor tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4) + mmq_y/4), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q2_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 228093 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 228102 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 228297 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 228318 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 228365 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229285 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229301 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229341 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229357 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229452 + Length: 0 + ReplacementText: " /*\n DPCT1049:32: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229460 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n sycl::local_accessor tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2) + mmq_y/2), cgh);\n sycl::local_accessor tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4) + mmq_y/4), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229596 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229649 + Length: 0 + ReplacementText: " /*\n DPCT1049:33: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229657 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n sycl::local_accessor tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2) + mmq_y/2), cgh);\n sycl::local_accessor tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4) + mmq_y/4), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q3_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229793 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 229809 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 230004 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 230025 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 230055 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 230975 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 230991 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231031 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231047 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231142 + Length: 0 + ReplacementText: " /*\n DPCT1049:34: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231150 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n sycl::local_accessor tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231286 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231339 + Length: 0 + ReplacementText: " /*\n DPCT1049:35: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231347 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n sycl::local_accessor tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q4_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231483 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231492 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231687 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231708 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 231738 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232658 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232674 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232714 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232730 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232825 + Length: 0 + ReplacementText: " /*\n DPCT1049:36: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232833 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n sycl::local_accessor tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 232969 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233022 + Length: 0 + ReplacementText: " /*\n DPCT1049:37: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233030 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n sycl::local_accessor tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q5_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233166 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233175 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233370 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233391 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 233421 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234341 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234357 + Length: 27 + ReplacementText: 1, block_num_y, block_num_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234397 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234413 + Length: 20 + ReplacementText: 1, nwarps, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234508 + Length: 0 + ReplacementText: " /*\n DPCT1049:38: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234516 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n sycl::local_accessor tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234652 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234705 + Length: 0 + ReplacementText: " /*\n DPCT1049:39: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234713 + Length: 136 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->submit(\n [&](sycl::handler &cgh) {\n sycl::local_accessor tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE) + mmq_y), cgh);\n sycl::local_accessor tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n sycl::local_accessor tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8) + mmq_y/8), cgh);\n sycl::local_accessor tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n sycl::local_accessor tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n mul_mat_q6_K(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234849 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 234858 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235043 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235077 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235093 + Length: 23 + ReplacementText: nchannels_y, nrows_x, 1 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235129 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235145 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235167 + Length: 115 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235282 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235524 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235558 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235574 + Length: 23 + ReplacementText: nchannels_y, nrows_x, 1 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235610 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235626 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235648 + Length: 157 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 235805 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236061 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236170 + Length: 157 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236327 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236583 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236692 + Length: 157 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 236849 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237106 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237207 + Length: 148 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_q(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237355 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237612 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237713 + Length: 148 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_q(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 237861 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238118 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238219 + Length: 148 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_q(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238367 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238623 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238732 + Length: 157 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238889 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 238983 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239094 + Length: 77 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n scale_f32(x, dst, scale, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239171 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239280 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239391 + Length: 80 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n clamp_f32(x, dst, min, max, k, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239471 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239709 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239776 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239925 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239941 + Length: 22 + ReplacementText: 1, num_blocks_x, nrows + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 239992 + Length: 0 + ReplacementText: " /*\n DPCT1049:40: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240000 + Length: 168 + ReplacementText: " dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n rope(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240168 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240183 + Length: 0 + ReplacementText: " /*\n DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240191 + Length: 167 + ReplacementText: " dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n rope(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240358 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240619 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240686 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240835 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 240851 + Length: 22 + ReplacementText: 1, num_blocks_x, nrows + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241009 + Length: 0 + ReplacementText: " /*\n DPCT1049:42: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241017 + Length: 206 + ReplacementText: " dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n rope_neox(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241223 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241238 + Length: 0 + ReplacementText: " /*\n DPCT1049:43: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241246 + Length: 205 + ReplacementText: " dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n rope_neox(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241451 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241638 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241705 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241721 + Length: 28 + ReplacementText: '1, 1, CUDA_ROPE_BLOCK_SIZE/4' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241850 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241866 + Length: 22 + ReplacementText: 1, nrows, num_blocks_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 241895 + Length: 115 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n rope_glm_f32(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242010 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242240 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242273 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242289 + Length: 27 + ReplacementText: 1, 1, CUDA_ALIBI_BLOCK_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242421 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242437 + Length: 22 + ReplacementText: 1, nrows, num_blocks_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242466 + Length: 99 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n alibi_f32(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242565 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242664 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242697 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242713 + Length: 15 + ReplacementText: 1, 1, WARP_SIZE + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242741 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242775 + Length: 68 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n k_sum_rows_f32(x, dst, ncols, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242843 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 242966 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243097 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243113 + Length: 11 + ReplacementText: 1, 1, ncols + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243137 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243201 + Length: 0 + ReplacementText: " /*\n DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243209 + Length: 86 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_argsort_f32_i32(x, dst, ncols, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243295 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243339 + Length: 0 + ReplacementText: " /*\n DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243347 + Length: 87 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_argsort_f32_i32(x, dst, ncols, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243434 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243635 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243668 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243839 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243855 + Length: 23 + ReplacementText: 1, block_num_x, nrows_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243885 + Length: 99 + ReplacementText: "stream->parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n diag_mask_inf_f32(x, dst, ncols_x, rows_per_channel, n_past, item_ct1);\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 243984 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244142 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244270 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244286 + Length: 13 + ReplacementText: 1, 1, nth + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244312 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244328 + Length: 13 + ReplacementText: 1, 1, nrows_x + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244344 + Length: 0 + ReplacementText: " /*\n DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244348 + Length: 87 + ReplacementText: "stream->submit(\n [&](sycl::handler &cgh) {\n /*\n DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was replaced with a value. Modify the code to use the original expression, provided in comments, if it is correct.\n */\n sycl::local_accessor buf_acc_ct1(sycl::range<1>(32/*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_nums * block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1, buf_acc_ct1.get_pointer());\n });\n });" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244435 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244488 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244628 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244805 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244821 + Length: 18 + ReplacementText: IC, OH, num_blocks + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 244846 + Length: 166 + ReplacementText: "{\n dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n stream->parallel_for(\n sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)), \n [=](sycl::nd_item<3> item_ct1) {\n im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1, item_ct1);\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 245012 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 245872 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 245946 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 247185 + Length: 43 + ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(look_ahead_size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 247595 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 247658 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 247732 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248074 + Length: 13 + ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248087 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248125 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248186 + Length: 0 + ReplacementText: "/*\nDPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.\n*/\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248286 + Length: 11 + ReplacementText: 'dpct::device_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248548 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 248622 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 249332 + Length: 0 + ReplacementText: " /*\n DPCT1082:65: Migration of CUmemAllocationProp type is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 249511 + Length: 0 + ReplacementText: " /*\n DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 249556 + Length: 0 + ReplacementText: " /*\n DPCT1007:69: Migration of cuMemCreate is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 249729 + Length: 0 + ReplacementText: " /*\n DPCT1007:70: Migration of cuMemAddressReserve is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 249877 + Length: 0 + ReplacementText: " /*\n DPCT1007:71: Migration of cuMemMap is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 250001 + Length: 0 + ReplacementText: " /*\n DPCT1082:72: Migration of CUmemAccessDesc type is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 250190 + Length: 0 + ReplacementText: " /*\n DPCT1007:73: Migration of cuMemSetAccess is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 250981 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251044 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251118 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251451 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251524 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251553 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251749 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251808 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251837 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 251999 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 253084 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 253419 + Length: 35 + ReplacementText: 'DPCT_CHECK_ERROR(g_device_count = dpct::dev_mgr::instance().device_count())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 253458 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254270 + Length: 8 + ReplacementText: int + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254308 + Length: 24 + ReplacementText: 'DPCT_CHECK_ERROR(device = id)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254335 + Length: 0 + ReplacementText: " /*\n DPCT1028:74: The cuDeviceGetAttribute was not migrated because parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is unsupported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254488 + Length: 0 + ReplacementText: " /*\n DPCT1082:75: Migration of CUmemAllocationProp type is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254723 + Length: 0 + ReplacementText: " /*\n DPCT1007:76: Migration of cuMemGetAllocationGranularity is not supported.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 254977 + Length: 14 + ReplacementText: 'dpct::device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255021 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255045 + Length: 5 + ReplacementText: prop + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255052 + Length: 2 + ReplacementText: 'dpct::dev_mgr::instance().get_device(id)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255055 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255058 + Length: 0 + ReplacementText: " /*\n DPCT1005:77: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255151 + Length: 4 + ReplacementText: 'get_name()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255162 + Length: 5 + ReplacementText: 'get_major_version()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255174 + Length: 5 + ReplacementText: 'get_minor_version()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255286 + Length: 14 + ReplacementText: 'get_global_mem_size()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255454 + Length: 0 + ReplacementText: " /*\n DPCT1005:78: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255498 + Length: 5 + ReplacementText: 'get_major_version()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255514 + Length: 5 + ReplacementText: 'get_minor_version()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255906 + Length: 0 + ReplacementText: " /*\n DPCT1025:79: The SYCL queue is created ignoring the flag and priority options.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 255933 + Length: 72 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[id][is] = dpct::get_current_device().create_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 256084 + Length: 35 + ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = &dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 256122 + Length: 0 + ReplacementText: " /*\n DPCT1027:80: The call to cublasSetMathMode was replaced with 0 because this functionality is redundant in SYCL.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 256147 + Length: 67 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 256401 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257037 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257152 + Length: 11 + ReplacementText: 'dpct::err0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257170 + Length: 36 + ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257208 + Length: 0 + ReplacementText: " /*\n DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257223 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257265 + Length: 28 + ReplacementText: " /*\n DPCT1026:83: The call to cudaGetLastError was removed because this functionality is redundant in SYCL.\n */\n /*\n DPCT1001:81: The statement could not be removed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257379 + Length: 0 + ReplacementText: " /*\n DPCT1009:84: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257411 + Length: 23 + ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err)*/' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257485 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257524 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257541 + Length: 17 + ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257558 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257562 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257571 + Length: 11 + ReplacementText: 'dpct::err0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257713 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257734 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257741 + Length: 14 + ReplacementText: 'dpct::memcpy_direction' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 257841 + Length: 22 + ReplacementText: 'dpct::host_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258114 + Length: 24 + ReplacementText: 'dpct::device_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258253 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258879 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258918 + Length: 6 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258924 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258933 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 258978 + Length: 17 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259050 + Length: 0 + ReplacementText: '*' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259057 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259323 + Length: 11 + ReplacementText: 'dpct::err0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259339 + Length: 17 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259395 + Length: 0 + ReplacementText: '*' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259402 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259404 + Length: 0 + ReplacementText: " /*\n DPCT1001:85: The statement could not be removed.\n */\n /*\n DPCT1000:86: Error handling if-stmt was detected but could not be rewritten.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259425 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259473 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259493 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 259674 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 260139 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 261515 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 261845 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 261872 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 262015 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 262476 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 262832 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 263151 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 263965 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 264284 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 264604 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 265034 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 265470 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 265906 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 266336 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 266772 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 267310 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 267739 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 268320 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 268937 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 269563 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 270147 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 270721 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 271448 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 271471 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 271702 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 274068 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 276458 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 278619 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 281822 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 281845 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 282160 + Length: 18 + ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 282780 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283141 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283196 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283260 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283624 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283679 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283743 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283794 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283831 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283875 + Length: 45 + ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 283957 + Length: 396 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, dpct::library_data_t::real_half))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 285106 + Length: 45 + ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 285188 + Length: 244 + ReplacementText: 'DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00, src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]), dst_dd_i, ldc))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 285515 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 285695 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 287917 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 287934 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 288500 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 288517 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 288959 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 290051 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 291125 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 291444 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 291957 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 292569 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 293193 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 293941 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 294213 + Length: 0 + ReplacementText: " /*\n DPCT1010:87: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 294228 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 294485 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 294834 + Length: 0 + ReplacementText: " /*\n DPCT1010:88: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 294849 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 295064 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 296196 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297085 + Length: 0 + ReplacementText: " /*\n DPCT1010:89: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297100 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297204 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(main_stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297256 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297280 + Length: 13 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297294 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297367 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 297400 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 298826 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 302402 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 303369 + Length: 0 + ReplacementText: " /*\n DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 303396 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 304041 + Length: 0 + ReplacementText: " /*\n DPCT1024:91: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 304060 + Length: 86 + ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[g_main_device][0] = g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 305030 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 305215 + Length: 68 + ReplacementText: 'DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier({*src0_extra->events[g_main_device][0]}))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 306734 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 306828 + Length: 78 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 306906 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 306915 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307176 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307252 + Length: 78 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307330 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307339 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307967 + Length: 0 + ReplacementText: " /*\n DPCT1010:92: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 307998 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 308505 + Length: 0 + ReplacementText: " /*\n DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 308532 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 308720 + Length: 14 + ReplacementText: 'dpct::memcpy_direction' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 308884 + Length: 22 + ReplacementText: 'dpct::device_to_host' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 309086 + Length: 24 + ReplacementText: 'dpct::device_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310027 + Length: 17 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310203 + Length: 0 + ReplacementText: '*' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310210 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310501 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310566 + Length: 6 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310572 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310581 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310777 + Length: 0 + ReplacementText: " /*\n DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 310808 + Length: 51 + ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[id][is] = stream->ext_oneapi_submit_barrier())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 312109 + Length: 83 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier({*src0_extra->events[id][is]}))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 312346 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 312379 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 316249 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 316870 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 317481 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 317590 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318161 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318716 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318774 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318919 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318928 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 318982 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319008 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319288 + Length: 0 + ReplacementText: ', const sycl::nd_item<3> &item_ct1' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319310 + Length: 10 + ReplacementText: 'item_ct1.get_group(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319323 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319336 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(2)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319367 + Length: 10 + ReplacementText: 'item_ct1.get_group(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319380 + Length: 10 + ReplacementText: 'item_ct1.get_local_range(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319393 + Length: 11 + ReplacementText: 'item_ct1.get_local_id(1)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 319926 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 320948 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321026 + Length: 61 + ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321237 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321259 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321748 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321858 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321896 + Length: 19 + ReplacementText: 'dpct::library_data_t' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321958 + Length: 14 + ReplacementText: 'dpct::library_data_t' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 321996 + Length: 10 + ReplacementText: 'dpct::library_data_t::real_half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 322099 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 322133 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 322435 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 322481 + Length: 4 + ReplacementText: 'sycl::half' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 322608 + Length: 10 + ReplacementText: 'dpct::library_data_t::real_float' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 324031 + Length: 613 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01/sizeof(sycl::half), src0->nb[2]/sizeof(sycl::half), (const char *) src1_as_f16.get(), dpct::library_data_t::real_half, nb11/sizeof(float), src1->nb[2]/sizeof(float), beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), ne12*ne13, cu_compute_type))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 324853 + Length: 4 + ReplacementText: 'sycl::range<3>' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 324869 + Length: 10 + ReplacementText: 1, ne12, ne13 + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 324882 + Length: 0 + ReplacementText: " /*\n DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 324890 + Length: 319 + ReplacementText: "{\n dpct::has_capability_or_fail(main_stream->get_device(), {sycl::aspect::fp16});\n\n main_stream->submit(\n [&](sycl::handler &cgh) {\n const sycl::half * src1_as_f16_get_ct1 = src1_as_f16.get();\n const void ** ptrs_src_get_ct3 = ptrs_src.get();\n void ** ptrs_dst_get_ct4 = ptrs_dst.get();\n\n cgh.parallel_for(\n sycl::nd_range<3>(block_dims, block_dims), \n [=](sycl::nd_item<3> item_ct1) {\n k_compute_batched_ptrs(src0_as_f16, src1_as_f16_get_ct1, dst_t, ptrs_src_get_ct3, ptrs_dst_get_ct4, ne12, ne13, ne23, nb02, nb03, nb12, nb13, nbd2, nbd3, r2, r3, item_ct1);\n });\n });\n }" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: true + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 325209 + Length: 1 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 325211 + Length: 0 + ReplacementText: " /*\n DPCT1010:95: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n */\n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 325230 + Length: 18 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 325282 + Length: 499 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), dpct::library_data_t::real_half, nb01/sizeof(sycl::half), (const void **) (ptrs_src.get() + 1*ne23), dpct::library_data_t::real_half, nb11/sizeof(float), beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, ne23, cu_compute_type))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 326000 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 336638 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337017 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337254 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337312 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337336 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337345 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 337367 + Length: 29 + ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339747 + Length: 14 + ReplacementText: 'dpct::memcpy_direction' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339822 + Length: 22 + ReplacementText: 'dpct::host_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339847 + Length: 24 + ReplacementText: 'dpct::device_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339887 + Length: 14 + ReplacementText: 'dpct::memcpy_direction' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339962 + Length: 22 + ReplacementText: 'dpct::device_to_host' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 339987 + Length: 24 + ReplacementText: 'dpct::device_to_device' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 340538 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 340668 + Length: 11 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 340679 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 340688 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341614 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341739 + Length: 10 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341749 + Length: 8 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341758 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341888 + Length: 29 + ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 341927 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 342360 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 343164 + Length: 12 + ReplacementText: 'dpct::queue_ptr' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 344994 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 347078 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 348753 + Length: 22 + ReplacementText: 'DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 348950 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349006 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349039 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349078 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349103 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349281 + Length: 72 + ReplacementText: 'DPCT_CHECK_ERROR(extra->events[id][is] = new sycl::event())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349415 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349471 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349874 + Length: 32 + ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(extra->data_device[id], dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 349906 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 350108 + Length: 39 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::destroy_event(extra->events[id][is]))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 350200 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 350900 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353118 + Length: 33 + ReplacementText: 'DPCT_CHECK_ERROR(data = (char *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353512 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353557 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353582 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353755 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353838 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 353996 + Length: 45 + ReplacementText: 'DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 354826 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 354887 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 355134 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 355213 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 355238 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 355242 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 355840 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356198 + Length: 14 + ReplacementText: 'dpct::device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356238 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356262 + Length: 5 + ReplacementText: prop + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356269 + Length: 13 + ReplacementText: 'dpct::dev_mgr::instance().get_device(g_main_device)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356283 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356385 + Length: 4 + ReplacementText: 'get_name()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356399 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356823 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356902 + Length: 26 + ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(g_scratch_buffer, dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356928 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 356964 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362011 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362043 + Length: 33 + ReplacementText: 'DPCT_CHECK_ERROR(device_count = dpct::dev_mgr::instance().device_count())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362080 + Length: 11 + ReplacementText: '0' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362145 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362242 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362248 + Length: 14 + ReplacementText: 'dpct::device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362284 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362308 + Length: 5 + ReplacementText: prop + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362315 + Length: 6 + ReplacementText: 'dpct::dev_mgr::instance().get_device(device)' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362322 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362380 + Length: 4 + ReplacementText: 'get_name()' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 362388 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 363487 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 363602 + Length: 22 + ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 363624 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 363644 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 363953 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 364993 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365077 + Length: 31 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365109 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365150 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365299 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365509 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365551 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365603 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365628 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365632 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365781 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 365991 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366033 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366091 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366116 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366120 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366210 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366365 + Length: 23 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366407 + Length: 10 + ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366452 + Length: 0 + ReplacementText: '.wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 366456 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 367132 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 367331 + Length: 26 + ReplacementText: 'DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 367548 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 371821 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372103 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372160 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372184 + Length: 36 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372221 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372225 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372367 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372649 + Length: 15 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372712 + Length: 24 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372736 + Length: 36 + ReplacementText: '' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372773 + Length: 0 + ReplacementText: ')' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372777 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372845 + Length: 0 + ReplacementText: ' try ' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 372953 + Length: 57 + ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait())' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Offset: 373036 + Length: 0 + ReplacementText: "\ncatch (sycl::exception const &exc) {\n std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n std::exit(1);\n}" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu' + Digest: fe16d2da27d2d01e9e6dcb75ef2d0692 +DpctVersion: 18.0.0 +MainHelperFileName: '' +USMLevel: '' +FeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '' + Specified: false + CtadEnabled: + Value: 'false' + Specified: false + EnablepProfiling: + Value: 'true' + Specified: true + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + HelperFuncPreferenceFlag: + Value: '0' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + NoUseGenericSpace: + Value: '' + Specified: true + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/dpcpp_out2/ggml-alloc.h b/dpcpp_out2/ggml-alloc.h new file mode 100644 index 0000000000000..64a412468915b --- /dev/null +++ b/dpcpp_out2/ggml-alloc.h @@ -0,0 +1,92 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_backend; +struct ggml_backend_buffer; +struct ggml_backend_buffer_type; + +// +// Legacy API +// + +typedef struct ggml_allocr * ggml_allocr_t; + +// initialize allocator for use with CPU backend only +GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment); +GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment); + +// initialize allocator for use with ggml-backend +GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer +GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend); + +GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc); + +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n); + +GGML_API void ggml_allocr_free (ggml_allocr_t alloc); +GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc); +GGML_API void ggml_allocr_reset (ggml_allocr_t alloc); +GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc); + +GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph); + +// +// ggml-backend v2 API +// + +// Separate tensor and graph allocator objects +// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators +// The original API is kept as a wrapper around the new API + +// Tensor allocator +typedef struct ggml_tallocr * ggml_tallocr_t; + +GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); +GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); +GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer +GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); + +GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); + +GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc); +GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc); +GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc); +GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc); + + +// Graph allocator +typedef struct ggml_gallocr * ggml_gallocr_t; + +GGML_API ggml_gallocr_t ggml_gallocr_new(void); +GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); + +GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n); +GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph); + +// Allocate tensors from the allocators given by the hash table +GGML_API void ggml_gallocr_alloc_graph_n( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + struct ggml_hash_set hash_set, + ggml_tallocr_t * hash_node_talloc); + + +// Utils +// Create a buffer and allocate all the tensors in a ggml_context +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend); + +#ifdef __cplusplus +} +#endif diff --git a/dpcpp_out2/ggml-backend-impl.h b/dpcpp_out2/ggml-backend-impl.h new file mode 100644 index 0000000000000..05859935a3c2f --- /dev/null +++ b/dpcpp_out2/ggml-backend-impl.h @@ -0,0 +1,116 @@ +#pragma once + +// ggml-backend internal header + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // + // Backend buffer + // + + // buffer type + typedef void * ggml_backend_buffer_type_context_t; + + struct ggml_backend_buffer_type_i { + ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // check if tensor data is in host memory + // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) + bool (*is_host) (ggml_backend_buffer_type_t buft); + }; + + struct ggml_backend_buffer_type { + struct ggml_backend_buffer_type_i iface; + ggml_backend_buffer_type_context_t context; + }; + + // buffer + typedef void * ggml_backend_buffer_context_t; + + struct ggml_backend_buffer_i { + void (*free_buffer) (ggml_backend_buffer_t buffer); + //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + // (optional) copy tensor between different buffer-type, allow for single-copy tranfers + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + }; + + struct ggml_backend_buffer { + struct ggml_backend_buffer_i iface; + ggml_backend_buffer_type_t buft; + ggml_backend_buffer_context_t context; + size_t size; + }; + + ggml_backend_buffer_t ggml_backend_buffer_init( + ggml_backend_buffer_type_t buft, + struct ggml_backend_buffer_i iface, + ggml_backend_buffer_context_t context, + size_t size); + + + // + // Backend + // + + typedef void * ggml_backend_context_t; + + struct ggml_backend_i { + const char * (*get_name)(ggml_backend_t backend); + + void (*free)(ggml_backend_t backend); + + // buffer allocation + ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); + + // (optional) asynchroneous tensor data access + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + // (optional) asynchroneous tensor copy + void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + + void (*synchronize)(ggml_backend_t backend); + + // compute graph with a plan + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + + // compute graph without a plan + void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + }; + + struct ggml_backend { + struct ggml_backend_i iface; + + ggml_backend_context_t context; + }; + + + // + // Backend registry + // + + typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data); + + void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + +#ifdef __cplusplus +} +#endif diff --git a/dpcpp_out2/ggml-backend.h b/dpcpp_out2/ggml-backend.h new file mode 100644 index 0000000000000..a9d2fddd726a8 --- /dev/null +++ b/dpcpp_out2/ggml-backend.h @@ -0,0 +1,188 @@ +#pragma once + +#include "ggml.h" +#include "ggml-alloc.h" + +#ifdef __cplusplus +extern "C" { +#endif + + typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; + typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef struct ggml_backend * ggml_backend_t; + typedef void * ggml_backend_graph_plan_t; + + // + // Backend buffer + // + + // buffer type + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + + // buffer + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); + + // + // Backend + // + + + GGML_API const char * ggml_backend_name(ggml_backend_t backend); + GGML_API void ggml_backend_free(ggml_backend_t backend); + + GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); + GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); + GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); + + GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy + + // + // CPU backend + // + + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + + // Create a backend buffer from an existing pointer + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + +#ifdef GGML_USE_CPU_HBM + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); +#endif + + // + // Backend registry + // + + // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way + + GGML_API size_t ggml_backend_reg_get_count(void); + GGML_API size_t ggml_backend_reg_find_by_name(const char * name); + GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + GGML_API const char * ggml_backend_reg_get_name(size_t i); + GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific + GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); + GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size); + + // + // Backend scheduler + // + + // The backend scheduler allows for multiple backends to be used together + // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends + // The backends are selected based on: + // - the backend that supports the operation + // - the location of the pre-allocated tensors (e.g. the weights) + /* + Example usage: + + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends); + // sched is initialized with measure allocators and cannot be used until allocated with a measure graph + + // initialize buffers from a measure graph + measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed + + // in build_graph: + build_graph(...) { + // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) + alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu); + ggml_allocr_alloc(alloc_cpu, tensor); + + // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) + struct ggml_tensor * node = ggml_mul_mat(ctx, ...); + ggml_backend_sched_set_node_backend(sched, node, backend_gpu); + } + + // allocate backend buffers from measure graph + ggml_backend_sched_init_measure(sched, measure_graph); + + // the scheduler is now ready to compute graphs + + // compute + graph = build_graph(sched); + ggml_backend_sched_graph_compute(sched, graph); + */ + + struct ggml_backend_sched; + typedef struct ggml_backend_sched * ggml_backend_sched_t; + + // Initialize a backend scheduler + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends); + + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + + // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + + GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); + + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + + // Allocate a graph on the backend scheduler + GGML_API void ggml_backend_sched_graph_compute( + ggml_backend_sched_t sched, + struct ggml_cgraph * graph); + + + // + // Utils + // + + struct ggml_backend_graph_copy { + ggml_backend_buffer_t buffer; + struct ggml_context * ctx_allocated; + struct ggml_context * ctx_unallocated; + struct ggml_cgraph * graph; + }; + + // Copy a graph to a different backend + GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + + typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + + // Compare the output of two backends + GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + + // Tensor initialization + GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); + GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + + +#ifdef __cplusplus +} +#endif diff --git a/dpcpp_out2/ggml-cuda.dp.cpp b/dpcpp_out2/ggml-cuda.dp.cpp new file mode 100644 index 0000000000000..fc6c68cdcef01 --- /dev/null +++ b/dpcpp_out2/ggml-cuda.dp.cpp @@ -0,0 +1,12724 @@ +#define DPCT_PROFILING_ENABLED +#define DPCT_COMPAT_RT_VERSION 12010 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap abort +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#else + +#if DPCT_COMPAT_RT_VERSION < 11020 +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 + +#endif // defined(GGML_USE_HIPBLAS) + +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" +#include + +#include + +#define MIN_CC_DP4A 510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_VOLTA 700 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) + +#define GGML_CUDA_MAX_NODES 8192 + +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +//#define GGML_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +// probably other such cases, and not sure what happens on AMD hardware +#if !defined(GGML_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(__gfx1100__) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif // defined(GGML_USE_HIPBLAS) + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#if DPCT_COMPAT_RT_VERSION >= 12000 + static const char *cublas_get_error_str(const int err) { + /* + DPCT1009:48: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/ + ; + } +#else + static const char * cublas_get_error_str(const cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: return "unknown error"; + } + } +#endif // CUDART_VERSION >= 12000 + +[[noreturn]] +static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { + fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg); + fprintf(stderr, " in function %s at %s:%d\n", func, file, line); + GGML_ASSERT(!"CUDA error"); +} + +/* +DPCT1001:50: The statement could not be removed. +*/ +/* +DPCT1000:51: Error handling if-stmt was detected but could not be rewritten. +*/ +/* +DPCT1009:52: SYCL uses exceptions to report errors and does not use the error +codes. The original code was commented out and a warning string was inserted. +You need to rewrite this code. +*/ +#define CUDA_CHECK(err) do { \ + auto err_ = (err); if (err_ != 0) ggml_cuda_error( \ + #err, __func__, __FILE__, __LINE__, \ + "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/); \ +} while (0) +#define CUBLAS_CHECK(err) \ + do { auto err_ = (err); if (err_ != 0) \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ + cublas_get_error_str(err_)); } while (0) + +#if !defined(GGML_USE_HIPBLAS) +static const char *cu_get_error_str(int err) { + const char * err_str; + /* + DPCT1007:49: Migration of cuGetErrorString is not supported. + */ + cuGetErrorString(err, &err_str); + return err_str; +} +/* +DPCT1001:67: The statement could not be removed. +*/ +/* +DPCT1000:68: Error handling if-stmt was detected but could not be rewritten. +*/ +#define CU_CHECK(err) \ + do { auto err_ = (err); \ + if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__, \ + cu_get_error_str(err_)); } while (0) +#endif + +#if DPCT_COMPAT_RT_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + +#ifdef GGML_CUDA_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef sycl::float2 dfloat2; +#endif //GGML_CUDA_F16 + +static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8, + const int &i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8, + const int &i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8, + const int &i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +template +using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y, + int k, dpct::queue_ptr stream); +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +typedef void (*ggml_cuda_op_mul_mat_t)( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); +typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream); + +// QK = number of values after dequantization +// QR = QK / number of values before dequantization +// QI = number of 32 bit integers before dequantization + +#define QK4_0 32 +#define QR4_0 2 +#define QI4_0 (QK4_0 / (4 * QR4_0)) +typedef struct dpct_type_471834 { + sycl::half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); + +#define QK4_1 32 +#define QR4_1 2 +#define QI4_1 (QK4_1 / (4 * QR4_1)) +typedef struct dpct_type_143705 { + sycl::half2 dm; // dm.x = delta, dm.y = min + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); + +#define QK5_0 32 +#define QR5_0 2 +#define QI5_0 (QK5_0 / (4 * QR5_0)) +typedef struct dpct_type_673649 { + sycl::half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); + +#define QK5_1 32 +#define QR5_1 2 +#define QI5_1 (QK5_1 / (4 * QR5_1)) +typedef struct dpct_type_135589 { + sycl::half2 dm; // dm.x = delta, dm.y = min + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); + +#define QK8_0 32 +#define QR8_0 1 +#define QI8_0 (QK8_0 / (4 * QR8_0)) +typedef struct dpct_type_122878 { + sycl::half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); + +#define QK8_1 32 +#define QR8_1 1 +#define QI8_1 (QK8_1 / (4 * QR8_1)) +typedef struct dpct_type_143721 { + sycl::half2 ds; // ds.x = delta, ds.y = sum + int8_t qs[QK8_0]; // quants +} block_q8_1; +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding"); + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm, + int **x_qh, int **x_sc); +typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx, + int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, + int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, + const int &i_max, const int &k, + const int &blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms, + const int &i, const int &j, const int &k); + +//================================= k-quants + +#ifdef GGML_QKK_64 +#define QK_K 64 +#define K_SCALE_SIZE 4 +#else +#define QK_K 256 +#define K_SCALE_SIZE 12 +#endif + +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) +typedef struct dpct_type_619598 { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + sycl::half2 dm; // super-block scale for quantized scales/mins +} block_q2_K; +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); + +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) +typedef struct dpct_type_138576 { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#ifdef GGML_QKK_64 + uint8_t scales[2]; // scales, quantized with 8 bits +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + sycl::half d; // super-block scale +} block_q3_K; +//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); + +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) +#ifdef GGML_QKK_64 +typedef struct { + half dm[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); +#else +typedef struct dpct_type_154943 { + sycl::half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); +#endif + +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) +#ifdef GGML_QKK_64 +typedef struct { + half d; // super-block scale + int8_t scales[QK_K/16]; // block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +#else +typedef struct dpct_type_866817 { + sycl::half2 dm; // super-block scale for quantized scales/mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +#endif + +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) +typedef struct dpct_type_107281 { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales + sycl::half d; // delta +} block_q6_K; +static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); + +#define WARP_SIZE 32 +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#define CUDA_GELU_BLOCK_SIZE 256 +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_TANH_BLOCK_SIZE 256 +#define CUDA_RELU_BLOCK_SIZE 256 +#define CUDA_SQR_BLOCK_SIZE 256 +#define CUDA_CPY_BLOCK_SIZE 32 +#define CUDA_SCALE_BLOCK_SIZE 256 +#define CUDA_CLAMP_BLOCK_SIZE 256 +#define CUDA_ROPE_BLOCK_SIZE 256 +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 +#define CUDA_ALIBI_BLOCK_SIZE 32 +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 +#define CUDA_QUANTIZE_BLOCK_SIZE 256 +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 +#define CUDA_UPSCALE_BLOCK_SIZE 256 +#define CUDA_CONCAT_BLOCK_SIZE 256 +#define CUDA_PAD_BLOCK_SIZE 256 +#define CUDA_ACC_BLOCK_SIZE 256 +#define CUDA_IM2COL_BLOCK_SIZE 256 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE +#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 +#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE + +#define MUL_MAT_SRC1_COL_STRIDE 128 + +#define MAX_STREAMS 8 +static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { + {&dpct::get_in_order_queue()}}; + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + dpct::event_ptr + events[GGML_CUDA_MAX_DEVICES] + [MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +inline dpct::err0 ggml_cuda_set_device(const int device) try { + int current_device; + CUDA_CHECK(DPCT_CHECK_ERROR( + current_device = dpct::dev_mgr::instance().current_device_id())); + + if (device == current_device) { + return 0; + } + + /* + DPCT1093:53: The "device" device may be not the one intended for use. Adjust + the selected device if needed. + */ + return DPCT_CHECK_ERROR(dpct::select_device(device)); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static int g_device_count = -1; +static int g_main_device = 0; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; + +struct cuda_device_capabilities { + int cc; // compute capability + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory +}; + +static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} }; + + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +[[noreturn]] +static void bad_arch(const sycl::stream &stream_ct1) { + stream_ct1 << "ERROR: ggml-cuda was compiled without support for the " + "current GPU architecture.\n"; + __trap(); + + (void) bad_arch; // suppress unused function warning +} + +static __dpct_inline__ float warp_reduce_sum(float x, + const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1096:98: The right-most dimension of the work-group used in the SYCL + kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + } + return x; +} + +static __dpct_inline__ sycl::float2 +warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), + mask); + a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), + mask); + } + return a; +} + +static __dpct_inline__ float warp_reduce_max(float x, + const sycl::nd_item<3> &item_ct1) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + /* + DPCT1096:97: The right-most dimension of the work-group used in the SYCL + kernel that calls this function may be less than "32". The function + "dpct::permute_sub_group_by_xor" may return an unexpected result on the + CPU device. Modify the size of the work-group to ensure that the value + of the right-most dimension is a multiple of "32". + */ + x = sycl::fmax(x, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), x, mask)); + } + return x; +} + +static __dpct_inline__ float op_repeat(const float a, const float b) { + return b; +} + +static __dpct_inline__ float op_add(const float a, const float b) { + return a + b; +} + +static __dpct_inline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __dpct_inline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + +static void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +static void gelu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f * xi * + (1.0f + + sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi))); +} + +static void silu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i])); +} + +static void gelu_quick_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const float GELU_QUICK_COEF = -1.702f; + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i]))); +} + +static void tanh_f32(const float *x, float *dst, int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::tanh((float)(x[i])); +} + +static void relu_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0); +} + +static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + if (i >= k) { + return; + } + dst[i] = sycl::fmax((float)(x[i]), (float)0) + + sycl::fmin((float)(x[i]), 0.0f) * negative_slope; +} + +static void sqr_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] * x[i]; +} + +template +static void norm_f32(const float * x, float * dst, const int ncols, const float eps, + const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + + sycl::float2 mean_var = sycl::float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x() += xi; + mean_var.y() += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + /* + DPCT1118:0: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var, item_ct1); + } + + const float mean = mean_var.x() / ncols; + const float var = mean_var.y() / ncols - mean * mean; + const float inv_std = sycl::rsqrt(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(0) < ne02) { // src0 + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor, + const sycl::nd_item<3> &item_ct1) { + int ne0 = ne00 * scale_factor; + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int i00 = nidx / scale_factor; + int i01 = item_ct1.get_group(1) / scale_factor; + int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02; + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; +} + +static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00 && item_ct1.get_group(1) < ne01 && + item_ct1.get_group(0) < ne02) { + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + +template +static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps, + const sycl::nd_item<3> &item_ct1, float *s_sum) { + int start = item_ct1.get_group(2) * group_size; + int end = start + group_size; + + start += item_ct1.get_local_id(2); + + if (end >= ne_elements) { + end = ne_elements; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += block_size) { + tmp += x[j]; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:1: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:54: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += block_size) { + float xi = x[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:2: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:55: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + float variance = tmp / group_size; + float scale = sycl::rsqrt(variance + eps); + for (int j = start; j < end; j += block_size) { + dst[j] *= scale; + } +} + +template +static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps, + const sycl::nd_item<3> &item_ct1, float *s_sum) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:3: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float mean = tmp / ncols; + const float scale = sycl::rsqrt(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x() = (v.x() - 16.0f) * d; + v.y() = (v.y() - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x() = (v.x() * d) + m; + v.y() = (v.y() * d) + m; +#endif // GGML_CUDA_F16 +} + +static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib, + const int iqs, dfloat2 &v) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x() = x[ib].qs[iqs + 0]; + v.y() = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x() *= d; + v.y() *= d; +#endif // GGML_CUDA_F16 +} + +//================================== k-quants + +template +static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q2_K * x = (const block_q2_K *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int n = tid/32; + const int l = tid - 32*n; + const int is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int r = item_ct1.get_local_id(2) / 4; + const int tid = r/2; + const int is0 = r%2; + const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int n = tid / 4; + const int j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int tid = threadIdx.x; + const int is = tid/16; // 0 or 1 + const int il = tid%16; // 0...15 + const int im = il/8; // 0...1 + const int in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 32 threads + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; + const int ir = tid%8; + const int is = 2*il; + const int n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int tid = threadIdx.x; + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int il = tid/16; // il is in 0...3 + const int ir = tid%16; // ir is in 0...15 + const int is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int im = tid/8; // 0...3 + const int in = tid%8; // 0...7 + const int is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template +static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int i = item_ct1.get_group(2); +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int tid = item_ct1.get_local_id(2); + const int ip = tid/32; // ip is 0 or 1 + const int il = tid - 32*ip; // 0...32 + const int is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int tid = threadIdx.x; + const int ip = tid/16; // 0 or 1 + const int il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +/* +DPCT1110:4: The total declared local variable size in device function +dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:5: The total declared local variable size in device function +dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:6: The total declared local variable size in device function +dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:7: The total declared local variable size in device function +dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const sycl::half *x = (const sycl::half *)vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded, + const sycl::nd_item<3> &item_ct1) { + const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (ix >= kx_padded) { + return; + } + + const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + + const int i_padded = iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int ib = i_padded / QK8_1; // block index + const int iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = sycl::fabs((float)xi); + float sum = xi; + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), amax, mask)); + sum += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast(y[ib].ds.x()) = d; + reinterpret_cast(y[ib].ds.y()) = sum; +} + +template +static void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2)) * + 2; + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x(); + dst_row[iybs + iqs + y_offset] = v.y(); +} + +template +static void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + 2 * item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + const int ib = i/qk; // block index + const int iqs = (i%qk)/qr; // quant index + const int iybs = i - i%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x(); + y[iybs + iqs + y_offset] = v.y(); +} + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u, + const float &d4, + const sycl::half2 &ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + + const sycl::float2 ds8f = + ds8.convert(); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y()); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm4, + const sycl::half2 &ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi); + sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const sycl::float2 dm4f = + dm4.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d4d8 = dm4f.x() * ds8f.x(); + const float m4s8 = dm4f.y() * ds8f.y(); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u, + const float &d5, const sycl::half2 &ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + + const sycl::float2 ds8f = + ds8.convert(); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y()); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template +static __dpct_inline__ float +vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u, + const sycl::half2 &dm5, const sycl::half2 &ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = dpct::dp4a(vi0, u[2 * i + 0], + sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = dpct::dp4a(vi1, u[2 * i + 1], + sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const sycl::float2 dm5f = + dm5.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d5d8 = dm5f.x() * ds8f.x(); + const float m5s8 = dm5f.y() * ds8f.y(); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template +static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u, + const float &d8_0, + const float &d8_1) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template +static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u, + const sycl::half2 &dm8, + const sycl::half2 &ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = dpct::dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const sycl::float2 dm8f = + dm8.convert(); + const sycl::float2 ds8f = + ds8.convert(); + const float d8d8 = dm8f.x() * ds8f.x(); + const float m8s8 = dm8f.y() * ds8f.y(); +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += + d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * + dpct::dp4a( + m, u[i], + 0); // multiply constant q2_K part with sum of q8_1 values + } + + const sycl::float2 dm2f = + dm2.convert(); + + return dm2f.x() * sumf_d - dm2f.y() * sumf_m; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ scales, + const sycl::half2 &dm2, const float &d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = dpct::dp4a(m, u[i], + sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const sycl::float2 dm2f = + dm2.convert(); + + return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m); +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int &vl, const int &vh, const int *__restrict__ u, + const uint8_t *__restrict__ scales, const int &scale_offset, + const float &d3, const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = + dpct::vectorized_binary(vil, vih, dpct::sub_sat()); + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d3, + const float &d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = + dpct::dp4a(v1i, u[2 * i + 1], + dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 1], + dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F, + u[i * QI8_1 + j], sumi_d); // SIMD dot product + } + + const sycl::float2 ds8f = + ds8[i].convert(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int *__restrict__ vl, const int *__restrict__ vh, + const int *__restrict__ u, const uint8_t *__restrict__ sc, + const uint8_t *__restrict__ m, const sycl::half2 &dm5, + const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = + dpct::dp4a(v0i, u[2 * i + 0], + dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product + const int dot2 = + dpct::dp4a(0x01010101, u[2 * i + 0], + dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const sycl::float2 dm5f = + dm5.convert(); + + return dm5f.x() * sumf_d - dm5f.y() * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int *__restrict__ v, const int *__restrict__ u, + const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m, + const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j], + sumi_d); // SIMD dot product + } + + const sycl::float2 ds8f = + ds8[i].convert(); + + sumf_d += ds8f.x() * (sc[i] * sumi_d); + sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val + } + + const sycl::float2 dm4f = + dm4.convert(); + + return dm4f.x() * sumf_d - dm4f.y() * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, + const int *__restrict__ u, + const int8_t *__restrict__ scales, const float &d, + const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = dpct::vectorized_binary( + (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32 + + sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __dpct_inline__ float +vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u, + const int8_t *__restrict__ sc, const float &d6, + const float *__restrict__ d8) { + +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0], + sumi_d.x()); // SIMD dot product + sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1], + sumi_d.x()); // SIMD dot product + + sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4], + sumi_d.y()); // SIMD dot product + sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5], + sumi_d.y()); // SIMD dot product + } + + sumf_d += d8[i0 / 4] * + (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y()); + } + + return d6 * sumf_d; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __dpct_inline__ float +vec_dot_q4_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_0, float *tile_x_d_q4_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_0; + *x_dm = (sycl::half2 *)tile_x_d_q4_0; +} + +template +static __dpct_inline__ void +load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __dpct_inline__ float +vec_dot_q4_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +template +static __dpct_inline__ void +allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q4_1; + *x_dm = tile_x_dm_q4_1; +} + +template +static __dpct_inline__ void +load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __dpct_inline__ float +vec_dot_q5_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_0, float *tile_x_d_q5_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_0; + *x_dm = (sycl::half2 *)tile_x_d_q5_0; +} + +template +static __dpct_inline__ void +load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = dpct::vectorized_binary( + qs0, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = dpct::vectorized_binary( + qs1, 0x10101010, dpct::sub_sat()); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __dpct_inline__ float +vec_dot_q5_1_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +template +static __dpct_inline__ void +allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_ql_q5_1; + *x_dm = tile_x_dm_q5_1; +} + +template +static __dpct_inline__ void +load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +static __dpct_inline__ float +vec_dot_q8_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, + bq8_1->ds[0]); +} + +template +static __dpct_inline__ void +allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_qs_q8_0, float *tile_x_d_q8_0) { + (void)x_qh; (void)x_sc; + + *x_ql = tile_x_qs_q8_0; + *x_dm = (sycl::half2 *)tile_x_d_q8_0; +} + +template +static __dpct_inline__ void +load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; (void)x_sc; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; (void)x_sc; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +static __dpct_inline__ float +vec_dot_q2_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +template +static __dpct_inline__ void +allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K, + int *tile_x_sc_q2_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q2_K; + *x_dm = tile_x_dm_q2_K; + *x_sc = tile_x_sc_q2_K; +} + +template +static __dpct_inline__ void +load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +static __dpct_inline__ float +vec_dot_q3_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + i].ds[0]; + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +template +static __dpct_inline__ void +allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K, + int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) { + + *x_ql = tile_x_ql_q3_K; + *x_dm = tile_x_dm_q3_K; + *x_qh = tile_x_qh_q3_K; + *x_sc = tile_x_sc_q3_K; +} + +template +static __dpct_inline__ void +load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = dpct::vectorized_binary( + sc_low | sc_high, 0x20202020, dpct::sub_sat()); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = dpct::vectorized_binary(vll, vlh, dpct::sub_sat()); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +static __dpct_inline__ float +vec_dot_q4_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template +static __dpct_inline__ void +allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K, + int *tile_x_sc_q4_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q4_K; + *x_dm = tile_x_dm_q4_K; + *x_sc = tile_x_sc_q4_K; +} + +template +static __dpct_inline__ void +load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +static __dpct_inline__ float +vec_dot_q5_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = bq8i->ds[0]; + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + bad_arch(); +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +template +static __dpct_inline__ void +allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K, + int *tile_x_sc_q5_K) { + (void)x_qh; + + *x_ql = tile_x_ql_q5_K; + *x_dm = tile_x_dm_q5_K; + *x_sc = tile_x_sc_q5_K; +} + +template +static __dpct_inline__ void +load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +static __dpct_inline__ float +vec_dot_q6_K_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = bq8_1[bq8_offset + 2 * i].ds[0]; + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +template +static __dpct_inline__ void +allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) { + (void)x_qh; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template +static __dpct_inline__ void +load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql, + sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh, + int *__restrict__ x_sc, const int &i_offset, const int &i_max, + const int &k, const int &blocks_per_row) { + (void)x_qh; + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2 * WARP_SIZE + 1) + kq0] = + dpct::vectorized_binary(ql0 | qh0, 0x20202020, + dpct::sub_sat()); + x_ql[i * (2 * WARP_SIZE + 1) + kq1] = + dpct::vectorized_binary(ql1 | qh1, 0x20202020, + dpct::sub_sat()); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = sycl::min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat( + const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm, + const int *__restrict__ x_qh, const int *__restrict__ x_sc, + const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds, + const int &i, const int &j, const int &k) { + (void)x_qh; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +template +/* +DPCT1110:8: The total declared local variable size in device function mul_mat_q +exceeds 128 bytes and may cause high register pressure. Consult with your +hardware vendor to find the total register size available and adjust the code, +or use smaller sub-group size to avoid high register pressure. +*/ +static __dpct_inline__ void +mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy, + float *__restrict__ dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, + int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh, + int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = item_ct1.get_group(2) * mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = item_ct1.get_group(1) * mmq_x; + const int & col_y_0 = col_dst_0; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, + tile_x_qh, tile_x_sc, item_ct1.get_local_id(1), + nrows_x - row_x_0 - 1, item_ct1.get_local_id(2), + blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2); + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = dpct::min( + (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), + ncols_y - 1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE + + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned( + by0->qs, item_ct1.get_local_id(2) % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = + (ids0 + item_ct1.get_local_id(1) * QI8_1 + + item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) % + mmq_x; + const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1); + const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const sycl::half2 *dsi_src = + &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) + + ir * (WARP_SIZE / QI8_1) + kby] + .ds; + sycl::half2 *dsi_dst = + &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src)[0]; + } + } + + /* + DPCT1118:9: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:56: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i / WARP_SIZE][j / nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i, + item_ct1.get_local_id(1) + j, k); + } + } + } + + /* + DPCT1118:10: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:57: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1); + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_0, tile_x_d_q4_0); + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, + vec_dot_q4_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + allocate_tiles_q4_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1, + sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q4_1, tile_x_dm_q4_1); + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, + vec_dot_q4_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + allocate_tiles_q4_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_0, tile_x_d_q5_0); + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, + vec_dot_q5_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + allocate_tiles_q5_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1, + sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_1, tile_x_dm_q5_1); + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, + vec_dot_q5_1_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + allocate_tiles_q5_1(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_qs_q8_0, tile_x_d_q8_0); + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, + vec_dot_q8_0_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + allocate_tiles_q8_0(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K, + sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K); + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, + vec_dot_q2_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + allocate_tiles_q2_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K, + sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K, + int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, + tile_x_sc_q3_K); + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, + vec_dot_q3_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + allocate_tiles_q3_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K, + sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K); + mul_mat_q, VDR_Q4_K_Q8_1_MMQ, + vec_dot_q4_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + allocate_tiles_q4_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K, + sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs, + sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K); + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, + vec_dot_q5_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + allocate_tiles_q5_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm, + int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) { + int * tile_x_ql = nullptr; + sycl::half2 *tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); + +#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, + tile_x_ql, tile_x_dm, tile_x_sc); + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, + vec_dot_q6_K_q8_1_mul_mat>( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, + tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds); + +#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + allocate_tiles_q6_K(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + bad_arch(); +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +template +static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row * blocks_per_row + i + + item_ct1.get_local_id(2) / (qi / vdr); // x block index + + const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) * + (qk / QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_CUDA_F16 + } + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_CUDA_F16 + } +} + +static void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, + const sycl::nd_item<3> &item_ct1) { + + const sycl::half *x = (const sycl::half *)vx; + + const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0); + const int channel_x = channel / (nchannels_y / nchannels_x); + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; + col_x0 += item_ct1.get_local_range(2)) { + const int col_x = col_x0 + item_ct1.get_local_id(2); + + if (col_x >= ncols_x) { + break; + } + + // x is transposed and permuted + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; + const float xi = + sycl::vec(x[ix]) + .convert()[0]; + + const int row_y = col_x; + + + // y is not transposed but permuted + const int iy = channel*nrows_y + row_y; + + tmp += xi * y[iy]; + } + + // dst is not transposed and not permuted + const int idst = channel*nrows_dst + row_dst; + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[idst] = tmp; + } +} + +static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor, + const sycl::nd_item<3> &item_ct1) { + + const sycl::half *x = (const sycl::half *)vx; + + const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0); + const int channel_x = channel / channel_x_divisor; + + const int nrows_y = ncols_x; + const int nrows_dst = nrows_x; + const int row_dst = row_x; + + const int idst = channel*nrows_dst + row_dst; + + float tmp = 0.0f; + + for (int col_x0 = 0; col_x0 < ncols_x; + col_x0 += item_ct1.get_local_range(2)) { + const int col_x = col_x0 + item_ct1.get_local_id(2); + + if (col_x >= ncols_x) { + break; + } + + const int row_y = col_x; + + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; + const int iy = channel*nrows_y + row_y; + + const float xi = + sycl::vec(x[ix]) + .convert()[0]; + + tmp += xi * y[iy]; + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[idst] = tmp; + } +} + +static void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + sycl::half *dsti = (sycl::half *)cdsti; + + *dsti = sycl::vec(*xi) + .convert()[0]; +} + +static void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const sycl::half *xi = (const sycl::half *)cxi; + sycl::half *dsti = (sycl::half *)cdsti; + + *dsti = *xi; +} + +template +static void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = i - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = i - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float)v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j]*id; + + dsti->qs[j] = sycl::round((float)x0); + } +} + +static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float)v)) { + amax = sycl::fabs((float)v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_0/2 + j]*id; + + const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (xi[0 + j] - vmin)*id; + const float x1 = (xi[QK4_1/2 + j] - vmin)*id; + + const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +template +static void cpy_f32_q(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, + const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, + const sycl::nd_item<3> &item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)) * + qk; + + if (i >= ne) { + return; + } + + const int i02 = i / (ne00*ne01); + const int i01 = (i - i02*ne01*ne00) / ne00; + const int i00 = (i - i02*ne01*ne00 - i01*ne00); + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + + const int i12 = i / (ne10*ne11); + const int i11 = (i - i12*ne10*ne11) / ne10; + const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); + return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); +} + +struct rope_corr_dims { + float v[4]; +}; + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale); + } + *cos_theta = sycl::cos(theta) * mscale; + *sin_theta = sycl::sin(theta) * mscale; +} + +// rope == RoPE == rotary positional embedding +template +static void rope( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +, + const sycl::nd_item<3> &item_ct1) { + const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static void rope_neox( + const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims +, + const sycl::nd_item<3> &item_ct1) { + const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int ib = col / n_dims; + const int ic = col % n_dims; + + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; + const int i2 = row/p_delta_rows; + + float cur_rot = inv_ndims * ic - ib; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = + p * freq_scale * dpct::pow(theta_scale, col / 2.0f); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +, const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols); + // FIXME: this is likely wrong + const int p = pos != nullptr ? pos[i2] : 0; + + const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale; + const float sin_theta = sycl::sin((float)theta); + const float cos_theta = sycl::cos((float)theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = + ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale; + const float sin_block_theta = sycl::sin((float)block_theta); + const float cos_block_theta = sycl::cos((float)block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + +static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1, + const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = dpct::pow(m0, k + 1); + } else { + m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static void k_sum_rows_f32(const float * x, float * dst, const int ncols, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(1); + const int col = item_ct1.get_local_id(2); + + float sum = 0.0f; + for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum, item_ct1); + + if (col == 0) { + dst[row] = sum; + } +} + +template +static inline void swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, + const sycl::nd_item<3> &item_ct1) { + // bitonic sort + int col = item_ct1.get_local_id(2); + int row = item_ct1.get_group(1); + + if (col >= ncols) return; + + const float * x_row = x + row * ncols; + int * dst_row = dst + row * ncols; + + // initialize indices + if (col < ncols) { + dst_row[col] = col; + } + /* + DPCT1065:58: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better + performance if there is no access to global memory. + */ + item_ct1.barrier(); + + for (int k = 2; k <= ncols; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } else { + if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + swap(dst_row[col], dst_row[ixj]); + } + } + } + /* + DPCT1118:11: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:59: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + } + } +} + +static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, + const sycl::nd_item<3> &item_ct1) { + const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale, + const sycl::nd_item<3> &item_ct1, float *buf) { + const int tid = item_ct1.get_local_id(2); + const int rowx = item_ct1.get_group(2); + const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension + + const int block_size = item_ct1.get_local_range(2); + + const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + + float max_val = -INFINITY; + + for (int col = tid; col < ncols; col += block_size) { + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f)); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val, item_ct1); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf[lane_id] = -INFINITY; + } + /* + DPCT1118:12: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:60: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + if (lane_id == 0) { + buf[warp_id] = max_val; + } + /* + DPCT1118:13: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:61: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + max_val = buf[lane_id]; + max_val = warp_reduce_max(max_val, item_ct1); + } + + float tmp = 0.f; + + for (int col = tid; col < ncols; col += block_size) { + const int ix = rowx*ncols + col; + const int iy = rowy*ncols + col; + const float val = + sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val); + tmp += val; + dst[ix] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf[lane_id] = 0.f; + } + /* + DPCT1118:14: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:62: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + if (lane_id == 0) { + buf[warp_id] = tmp; + } + /* + DPCT1118:15: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + /* + DPCT1065:63: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for + better performance if there is no access to global memory. + */ + item_ct1.barrier(); + + tmp = buf[lane_id]; + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float inv_tmp = 1.f / tmp; + + for (int col = tid; col < ncols; col += block_size) { + const int i = rowx*ncols + col; + dst[i] *= inv_tmp; + } +} + +static void scale_f32(const float * x, float * dst, const float scale, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); +} + +static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta, + int IW, int IH, int OW, int KW, int KH, + int pelements, int CHW, int s0, int s1, int p0, + int p1, int d0, int d1, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (i >= pelements) { + return; + } + + const int ksize = OW * (KH > 1 ? KW : 1); + const int kx = i / ksize; + const int kd = kx * ksize; + const int ky = (i - kd) / OW; + const int ix = i % OW; + + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1; + + const int64_t offset_dst = + (item_ct1.get_group(1) * OW + ix) * CHW + + (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = + sycl::vec(0.0f) + .convert()[0]; + } else { + const int64_t offset_src = item_ct1.get_group(0) * offset_delta; + dst[offset_dst] = + sycl::vec(x[offset_src + iih * IW + iiw]) + .convert()[0]; + } +} + +template +static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const void *src0_dd, + const int32_t *src1_dd, float *dst_dd, + dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + GGML_ASSERT(ne00 % 2 == 0); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows( + src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + + (void) dst; +} + +template +static void get_rows_cuda_float(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const src0_t *src0_dd, const int32_t *src1_dd, + float *dst_dd, dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + } + + (void) dst; +} + +template +struct bin_bcast_cuda { + template + void operator()(const struct ggml_tensor *src0, + const struct ggml_tensor *src1, struct ggml_tensor *dst, + const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, + dpct::queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne0[] = {ne0, ne1, ne2, ne3}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb0[] = {nb0, nb1, nb2, nb3}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne0); + collapse(cne1); + } + } + { + int64_t ne0 = cne0[0]; + int64_t ne1 = cne0[1]; + int64_t ne2 = cne0[2]; + int64_t ne3 = cne0[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb0[0]; + size_t nb1 = cnb0[1]; + size_t nb2 = cnb0[2]; + size_t nb3 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); + block_dims[0] = std::min( + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, + s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s11, s12, s13, + item_ct1); + }); + } + } + } +}; + +static void acc_f32_cuda(const float *x, const float *y, float *dst, + const int n_elements, const int ne10, const int ne11, + const int ne12, const int nb1, const int nb2, + const int offset, dpct::queue_ptr stream) { + int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); +} + +static void gelu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_f32(x, dst, k, item_ct1); + }); +} + +static void silu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + silu_f32(x, dst, k, item_ct1); + }); +} + +static void gelu_quick_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + gelu_quick_f32(x, dst, k, item_ct1); + }); +} + +static void tanh_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + tanh_f32(x, dst, k, item_ct1); + }); +} + +static void relu_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + relu_f32(x, dst, k, item_ct1); + }); +} + +static void leaky_relu_f32_cuda(const float *x, float *dst, const int k, + const float negative_slope, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + leaky_relu_f32(x, dst, k, negative_slope, item_ct1); + }); +} + +static void sqr_f32_cuda(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + sqr_f32(x, dst, k, item_ct1); + }); +} + +static void norm_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const float eps, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1( + sycl::range<1>(32), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + norm_f32(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:17: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1( + sycl::range<1>(32), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + norm_f32<1024>(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void group_norm_f32_cuda(const float *x, float *dst, + const int num_groups, const int group_size, + const int ne_elements, dpct::queue_ptr stream) { + static const float eps = 1e-6f; + if (group_size < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + const float eps_ct4 = eps; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + group_norm_f32( + x, dst, group_size, ne_elements, eps_ct4, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:18: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + const float eps_ct4 = eps; + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + group_norm_f32<1024>(x, dst, group_size, ne_elements, + eps_ct4, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void concat_f32_cuda(const float *x, const float *y, float *dst, + const int ne0, int ne1, int ne2, int ne02, + dpct::queue_ptr stream) { + int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32(x, y, dst, ne0, ne02, item_ct1); + }); +} + +static void upscale_f32_cuda(const float *x, float *dst, const int ne00, + const int ne01, const int ne02, + const int scale_factor, dpct::queue_ptr stream) { + int ne0 = (ne00 * scale_factor); + int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; + sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1); + }); +} + +static void pad_f32_cuda(const float *x, float *dst, const int ne00, + const int ne01, const int ne02, const int ne0, + const int ne1, const int ne2, dpct::queue_ptr stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1); + }); +} + +static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const float eps, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + rms_norm_f32(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } else { + const sycl::range<3> block_dims(1, 1, 1024); + /* + DPCT1049:19: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(32), + cgh); + + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1, + s_sum_acc_ct1.get_pointer()); + }); + }); + } +} + +static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx, + const int ky, const int kx_padded, + dpct::queue_ptr stream) { + const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const sycl::range<3> num_blocks(1, ky, block_num_x); + const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(num_blocks * block_size, block_size), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + quantize_q8_1(x, vy, kx, kx_padded, item_ct1); + }); + } +} + +template +static void dequantize_block_cuda(const void *__restrict__ vx, + dst_t *__restrict__ y, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block(vx, y, k, item_ct1); + }); + } +} + +template +static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q2_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q3_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, item_ct1); + }); + } +} + +template +static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q5_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } +#else + dequantize_block_q6_K<<>>(vx, y); +#endif +} + +static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F32: + return dequantize_block_cuda<1, 1, convert_f32>; + default: + return nullptr; + } +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_block_cuda; + case GGML_TYPE_Q4_1: + return dequantize_block_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_F16: + return dequantize_block_cuda<1, 1, convert_f16>; + default: + return nullptr; + } +} + +static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, 32); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, 32); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} + +static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK5_1 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK8_0 == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q(vx, vy, dst, ncols, nrows, + item_ct1); + }); +} + +static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:20: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_0_acc_ct1.get_pointer(), + tile_x_d_q4_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:21: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q4_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_0_acc_ct1.get_pointer(), + tile_x_d_q4_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:22: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_1_acc_ct1.get_pointer(), + tile_x_dm_q4_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:23: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q4_1_acc_ct1.get_pointer(), + tile_x_dm_q4_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:24: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_0_acc_ct1.get_pointer(), + tile_x_d_q5_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:25: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q5_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_0_acc_ct1.get_pointer(), + tile_x_d_q5_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:26: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_1_acc_ct1.get_pointer(), + tile_x_dm_q5_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:27: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_1_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_1( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_1_acc_ct1.get_pointer(), + tile_x_dm_q5_1_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:28: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q8_0_acc_ct1.get_pointer(), + tile_x_d_q8_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:29: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_qs_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_d_q8_0_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0), + cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q8_0( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_qs_q8_0_acc_ct1.get_pointer(), + tile_x_d_q8_0_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:30: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q2_K_acc_ct1.get_pointer(), + tile_x_dm_q2_K_acc_ct1.get_pointer(), + tile_x_sc_q2_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:31: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K), + cgh); + sycl::local_accessor tile_x_sc_q2_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q2_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q2_K_acc_ct1.get_pointer(), + tile_x_dm_q2_K_acc_ct1.get_pointer(), + tile_x_sc_q2_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + +#if QK_K == 256 + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:32: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q3_K_acc_ct1.get_pointer(), + tile_x_dm_q3_K_acc_ct1.get_pointer(), + tile_x_qh_q3_K_acc_ct1.get_pointer(), + tile_x_sc_q3_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:33: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K), + cgh); + sycl::local_accessor tile_x_qh_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh); + sycl::local_accessor tile_x_sc_q3_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q3_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q3_K_acc_ct1.get_pointer(), + tile_x_dm_q3_K_acc_ct1.get_pointer(), + tile_x_qh_q3_K_acc_ct1.get_pointer(), + tile_x_sc_q3_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +#endif +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:34: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q4_K_acc_ct1.get_pointer(), + tile_x_dm_q4_K_acc_ct1.get_pointer(), + tile_x_sc_q4_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:35: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K), + cgh); + sycl::local_accessor tile_x_sc_q4_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q4_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q4_K_acc_ct1.get_pointer(), + tile_x_dm_q4_K_acc_ct1.get_pointer(), + tile_x_sc_q4_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:36: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_K_acc_ct1.get_pointer(), + tile_x_dm_q5_K_acc_ct1.get_pointer(), + tile_x_sc_q5_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:37: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K), + cgh); + sycl::local_accessor tile_x_sc_q5_K_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q5_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_q5_K_acc_ct1.get_pointer(), + tile_x_dm_q5_K_acc_ct1.get_pointer(), + tile_x_sc_q5_K_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy, + float *dst, const int ncols_x, + const int nrows_x, const int ncols_y, + const int nrows_y, const int nrows_dst, + dpct::queue_ptr stream) try { + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + const int compute_capability = g_device_caps[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const sycl::range<3> block_nums(1, block_num_y, block_num_x); + const sycl::range<3> block_dims(1, nwarps, WARP_SIZE); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + /* + DPCT1049:38: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } else { + const bool need_check = true; + /* + DPCT1049:39: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor tile_x_ql_acc_ct1( + sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh); + sycl::local_accessor tile_x_dm_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K), + cgh); + sycl::local_accessor tile_x_sc_acc_ct1( + sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh); + sycl::local_accessor tile_y_qs_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE), cgh); + sycl::local_accessor tile_y_ds_acc_ct1( + sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + mul_mat_q6_K( + vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, + nrows_dst, item_ct1, + tile_x_ql_acc_ct1.get_pointer(), + tile_x_dm_acc_ct1.get_pointer(), + tile_x_sc_acc_ct1.get_pointer(), + tile_y_qs_acc_ct1.get_pointer(), + tile_y_ds_acc_ct1.get_pointer()); + }); + }); + } + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y, + float *dst, const int ncols_x, + const int nrows_x, + const int nchannels_x, + const int nchannels_y, + dpct::queue_ptr stream) { + + const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, + nchannels_y, item_ct1); + }); + } +} + +static void ggml_mul_mat_vec_nc_f16_f32_cuda( + const void *vx, const float *y, float *dst, const int ncols_x, + const int nrows_x, const int row_stride_x, const int nchannels_x, + const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) { + + const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, + row_stride_x, channel_stride_x, + nchannels_y / nchannels_x, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), + sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q( + cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, + ne10, ne11, nb10, nb11, nb12, item_ct1); + }); +} + +static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne, + const int ne00, const int ne01, + const int nb00, const int nb01, + const int nb02, const int ne10, + const int ne11, const int nb10, + const int nb11, const int nb12, + dpct::queue_ptr stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, nb00, nb01, + nb02, ne10, ne11, nb10, nb11, nb12, + item_ct1); + }); + } +} + +static void scale_f32_cuda(const float *x, float *dst, const float scale, + const int k, dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + scale_f32(x, dst, scale, k, item_ct1); + }); +} + +static void clamp_f32_cuda(const float *x, float *dst, const float min, + const float max, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + clamp_f32(x, dst, min, max, k, item_ct1); + }); +} + +template +static void rope_cuda(const T *x, T *dst, int ncols, int nrows, + const int32_t *pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, + rope_corr_dims corr_dims, dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 2 == 0); + const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nrows); + if (pos == nullptr) { + /* + DPCT1049:40: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope(x, dst, ncols, pos, freq_scale, p_delta_rows, + freq_base, ext_factor, attn_factor, corr_dims, + item_ct1); + }); + } else { + /* + DPCT1049:41: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope(x, dst, ncols, pos, freq_scale, p_delta_rows, + freq_base, ext_factor, attn_factor, corr_dims, + item_ct1); + }); + } +} + +template +static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows, + const int32_t *pos, float freq_scale, + int p_delta_rows, float freq_base, float ext_factor, + float attn_factor, rope_corr_dims corr_dims, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 2 == 0); + const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, num_blocks_x, nrows); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.0f / n_dims; + + if (pos == nullptr) { + /* + DPCT1049:42: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, + item_ct1); + }); + } else { + /* + DPCT1049:43: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, item_ct1); + }); + } +} + +static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows, + const int32_t *pos, float freq_scale, + int p_delta_rows, float freq_base, int n_ctx, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % 4 == 0); + const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; + const sycl::range<3> block_nums(1, nrows, num_blocks_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_glm_f32(x, dst, ncols, pos, freq_scale, + p_delta_rows, freq_base, n_ctx, + item_ct1); + }); +} + +static void alibi_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, const int k_rows, + const int n_heads_log2_floor, const float m0, + const float m1, dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const sycl::range<3> block_nums(1, nrows, num_blocks_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + alibi_f32(x, dst, ncols, k_rows, + n_heads_log2_floor, m0, m1, item_ct1); + }); +} + +static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols, + const int nrows, dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const sycl::range<3> block_nums(1, nrows, 1); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + k_sum_rows_f32(x, dst, ncols, item_ct1); + }); +} + +static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols, + const int nrows, ggml_sort_order order, + dpct::queue_ptr stream) { + // bitonic sort requires ncols to be power of 2 + GGML_ASSERT((ncols & (ncols - 1)) == 0); + + const sycl::range<3> block_dims(1, 1, ncols); + const sycl::range<3> block_nums(1, nrows, 1); + if (order == GGML_SORT_ASC) { + /* + DPCT1049:44: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32(x, dst, ncols, item_ct1); + }); + } else if (order == GGML_SORT_DESC) { + /* + DPCT1049:45: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32(x, dst, ncols, item_ct1); + }); + } else { + GGML_ASSERT(false); + } +} + +static void diag_mask_inf_f32_cuda(const float *x, float *dst, + const int ncols_x, const int nrows_x, + const int rows_per_channel, const int n_past, + dpct::queue_ptr stream) { + const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const sycl::range<3> block_nums(1, block_num_x, nrows_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + diag_mask_inf_f32(x, dst, ncols_x, + rows_per_channel, n_past, + item_ct1); + }); +} + +static void soft_max_f32_cuda(const float *x, const float *y, float *dst, + const int ncols_x, const int nrows_x, + const int nrows_y, const float scale, + dpct::queue_ptr stream) { + int nth = WARP_SIZE; + while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; + const sycl::range<3> block_dims(1, 1, nth); + const sycl::range<3> block_nums(1, 1, nrows_x); + /* + DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the + limit. To get the device limit, query info::device::max_work_group_size. + Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler &cgh) { + /* + DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was + replaced with a value. Modify the code to use the original expression, + provided in comments, if it is correct. + */ + sycl::local_accessor buf_acc_ct1( + sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1, + buf_acc_ct1.get_pointer()); + }); + }); +} + +static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH, + int OW, int OH, int KW, int KH, int IC, + int offset_delta, int s0, int s1, int p0, + int p1, int d0, int d1, + dpct::queue_ptr stream) { + const int parallel_elements = OW * KW * KH; + const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; + sycl::range<3> block_nums(IC, OH, num_blocks); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * + sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), + sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, + parallel_elements, (IC * KH * KW), s0, s1, p0, + p1, d0, d1, item_ct1); + }); + } +} + +// buffer pool for cuda +#define MAX_CUDA_BUFFERS 256 + +struct scoped_spin_lock { + std::atomic_flag& lock; + scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { + while (lock.test_and_set(std::memory_order_acquire)) { + ; // spin + } + } + ~scoped_spin_lock() { + lock.clear(std::memory_order_release); + } + scoped_spin_lock(const scoped_spin_lock&) = delete; + scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; +}; + +static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; + +// #define DEBUG_CUDA_MALLOC +struct cuda_buffer { + void * ptr = nullptr; + size_t size = 0; +}; + +static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; +static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0}; + +static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); +#ifdef DEBUG_CUDA_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CUDA_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + cuda_buffer& b = g_cuda_buffer_pool[id][ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + CUDA_CHECK( + DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device( + look_ahead_size, dpct::get_in_order_queue()))); + *actual_size = look_ahead_size; + g_cuda_pool_size[id] += look_ahead_size; +#ifdef DEBUG_CUDA_MALLOC + fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, + (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); +#endif + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { + cuda_buffer& b = g_cuda_buffer_pool[id][i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); + g_cuda_pool_size[id] -= size; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +#if !defined(GGML_USE_HIPBLAS) +// pool with virtual memory +/* +DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported. +*/ +static std::vector + g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES]; +static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; +static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; +static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB + +static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + const size_t alignment = 128; + size = alignment * ((size + alignment - 1) / alignment); + + size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id]; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + const size_t granularity = g_device_caps[id].vmm_granularity; + reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); + + GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + /* + DPCT1082:65: Migration of CUmemAllocationProp type is not supported. + */ + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = id; + /* + DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not + supported. + */ + CUmemGenericAllocationHandle handle; + /* + DPCT1007:69: Migration of cuMemCreate is not supported. + */ + CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (g_cuda_pool_addr[id] == 0) { + /* + DPCT1007:70: Migration of cuMemAddressReserve is not supported. + */ + CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], + CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + } + + // map at the end of the pool + /* + DPCT1007:71: Migration of cuMemMap is not supported. + */ + CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id], + reserve_size, 0, handle, 0)); + + // set access + /* + DPCT1082:72: Migration of CUmemAccessDesc type is not supported. + */ + CUmemAccessDesc access = {}; + access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access.location.id = id; + access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + /* + DPCT1007:73: Migration of cuMemSetAccess is not supported. + */ + CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id], + reserve_size, &access, 1)); + + // add to the pool + g_cuda_pool_handles[id].push_back(handle); + g_cuda_pool_size[id] += reserve_size; + + //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024), + // (unsigned long long) (reserve_size/1024/1024)); + } + + GGML_ASSERT(g_cuda_pool_addr[id] != 0); + + void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]); + *actual_size = size; + g_cuda_pool_used[id] += size; + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr); +#endif + + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try { + scoped_spin_lock lock(g_cuda_pool_lock); + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr); +#endif + + g_cuda_pool_used[id] -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id])); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try { + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + if (g_device_caps[id].vmm) { + return ggml_cuda_pool_malloc_vmm(size, actual_size); + } else { + return ggml_cuda_pool_malloc_leg(size, actual_size); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_pool_free(void *ptr, size_t size) try { + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + if (g_device_caps[id].vmm) { + ggml_cuda_pool_free_vmm(ptr, size); + } else { + ggml_cuda_pool_free_leg(ptr, size); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} +#else +#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg +#define ggml_cuda_pool_free ggml_cuda_pool_free_leg +#endif // !defined(GGML_USE_HIPBLAS) + +template +struct cuda_pool_alloc { + T * ptr = nullptr; + size_t actual_size = 0; + + // size is in number of elements + T * alloc(size_t size) { + GGML_ASSERT(ptr == nullptr); + ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + cuda_pool_alloc(size_t size) { + alloc(size); + } + + ~cuda_pool_alloc() { + if (ptr != nullptr) { + ggml_cuda_pool_free(ptr, actual_size); + } + } + + T * get() { + return ptr; + } + + cuda_pool_alloc() = default; + cuda_pool_alloc(const cuda_pool_alloc &) = delete; + cuda_pool_alloc(cuda_pool_alloc &&) = delete; + cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete; + cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete; +}; + +static bool g_cublas_loaded = false; + +bool ggml_cublas_loaded(void) { + return g_cublas_loaded; +} + +void ggml_init_cublas() try { + static bool initialized = false; + + if (!initialized) { + +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + if (DPCT_CHECK_ERROR(g_device_count = + dpct::dev_mgr::instance().device_count()) != + 0) { + initialized = true; + g_cublas_loaded = false; + return; + } + + GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); + int64_t total_vram = 0; +#if defined(GGML_CUDA_FORCE_MMQ) + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); +#endif +#if defined(CUDA_USE_TENSOR_CORES) + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); +#else + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); +#endif + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); + for (int id = 0; id < g_device_count; ++id) { + int device_vmm = 0; + +#if !defined(GGML_USE_HIPBLAS) + int device; + CU_CHECK(DPCT_CHECK_ERROR(device = id)); + /* + DPCT1028:74: The cuDeviceGetAttribute was not migrated because + parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is + unsupported. + */ + CU_CHECK(cuDeviceGetAttribute( + &device_vmm, + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, + device)); + + if (device_vmm) { + /* + DPCT1082:75: Migration of CUmemAllocationProp type is not + supported. + */ + CUmemAllocationProp alloc_prop = {}; + alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + alloc_prop.location.id = id; + /* + DPCT1007:76: Migration of cuMemGetAllocationGranularity is not + supported. + */ + CU_CHECK(cuMemGetAllocationGranularity( + &g_device_caps[id].vmm_granularity, &alloc_prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + } +#endif // !defined(GGML_USE_HIPBLAS) + g_device_caps[id].vmm = !!device_vmm; + + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(id)))); + /* + DPCT1005:77: The SYCL device version is different from CUDA Compute + Compatibility. You may need to rewrite this code. + */ + fprintf(stderr, + " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, + prop.get_name(), prop.get_major_version(), + prop.get_minor_version(), device_vmm ? "yes" : "no"); + + g_tensor_split[id] = total_vram; + total_vram += prop.get_global_mem_size(); +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else + /* + DPCT1005:78: The SYCL device version is different from CUDA Compute + Compatibility. You may need to rewrite this code. + */ + g_device_caps[id].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + } + for (int id = 0; id < g_device_count; ++id) { + g_tensor_split[id] /= total_vram; + } + + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + // create cuda streams + for (int is = 0; is < MAX_STREAMS; ++is) { + /* + DPCT1025:79: The SYCL queue is created ignoring the flag and + priority options. + */ + CUDA_CHECK(DPCT_CHECK_ERROR( + g_cudaStreams[id][is] = + dpct::get_current_device().create_queue())); + } + + // create cublas handle + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = + &dpct::get_in_order_queue())); + /* + DPCT1027:80: The call to cublasSetMathMode was replaced with 0 + because this functionality is redundant in SYCL. + */ + CUBLAS_CHECK(0); + } + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + initialized = true; + g_cublas_loaded = true; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_set_tensor_split(const float * tensor_split) { + if (tensor_split == nullptr) { + return; + } + bool all_zero = true; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] != 0.0f) { + all_zero = false; + break; + } + } + if (all_zero) { + return; + } + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + g_tensor_split[i] /= split_sum; + } +} + +void *ggml_cuda_host_malloc(size_t size) try { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + dpct::err0 err = DPCT_CHECK_ERROR( + ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())); + /* + DPCT1000:82: Error handling if-stmt was detected but could not be rewritten. + */ + if (err != 0) { + // clear the error + /* + DPCT1026:83: The call to cudaGetLastError was removed because this + functionality is redundant in SYCL. + */ + /* + DPCT1001:81: The statement could not be removed. + */ + fprintf( + stderr, + "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + /* + DPCT1009:84: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. + */ + size / 1024.0 / 1024.0, + "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/); + return nullptr; + } + + return ptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_host_free(void *ptr) try { + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst, + const struct ggml_tensor *src, + int64_t i3, int64_t i2, + int64_t i1_low, int64_t i1_high, + dpct::queue_ptr stream) try { + + dpct::memcpy_direction kind; + char * src_ptr; + if (src->backend == GGML_BACKEND_CPU) { + kind = dpct::host_to_device; + src_ptr = (char *) src->data; + } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + kind = dpct::device_to_device; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + int id; + CUDA_CHECK(DPCT_CHECK_ERROR( + id = dpct::dev_mgr::instance().current_device_id())); + src_ptr = (char *) extra->data_device[id]; + } else { + GGML_ASSERT(false); + } + char * dst_ptr = (char *) dst; + + const int64_t ne0 = src->ne[0]; + const int64_t nb0 = src->nb[0]; + const int64_t nb1 = src->nb[1]; + const int64_t nb2 = src->nb[2]; + const int64_t nb3 = src->nb[3]; + const enum ggml_type type = src->type; + const int64_t ts = ggml_type_size(type); + const int64_t bs = ggml_blck_size(type); + int64_t i1_diff = i1_high - i1_low; + + const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; + if (nb0 == ts && nb1 == ts*ne0/bs) { + return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1)); + } else if (nb0 == ts) { + return DPCT_CHECK_ERROR( + dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1, + ts * ne0 / bs, i1_diff, kind, *stream)); + } else { + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy( + rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream)); + /* + DPCT1001:85: The statement could not be removed. + */ + /* + DPCT1000:86: Error handling if-stmt was detected but could not be + rewritten. + */ + if (r != 0) return r; + } + return 0; + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_op_get_rows(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_d, const float *src1_d, + float *dst_d, const dpct::queue_ptr &stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d, + src1_i32, dst_d, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + default: + // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_ASSERT(false); + break; + } +} + +template +inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, + (sycl::half *)dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, + main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + +static void ggml_cuda_op_repeat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_d, const float *src1_d, + float *dst_d, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream); + + (void) src1; + (void) src1_d; +} + +inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); + + (void) dst; +} + +inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); +} + +inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_group_norm(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int num_groups = dst->op_params[0]; + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_concat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); + } + + (void) src1; + (void) dst; +} + +inline void ggml_cuda_op_upscale(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + const int scale_factor = dst->op_params[0]; + + upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_cuda(src0_dd, dst_dd, + src0->ne[0], src0->ne[1], src0->ne[2], + dst->ne[0], dst->ne[1], dst->ne[2], main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_mul_mat_q( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static int64_t get_row_rounding(ggml_type type) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int64_t id = 0; id < g_device_count; ++id) { + if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > g_device_caps[id].cc) { + min_compute_capability = g_device_caps[id].cc; + } + if (max_compute_capability < g_device_caps[id].cc) { + max_compute_capability = g_device_caps[id].cc; + } + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +inline void ggml_cuda_op_mul_mat_vec_q( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + GGML_ASSERT(ggml_nrows(src1) == 1); + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddf_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_dequantize_mul_mat_vec( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + cuda_pool_alloc src1_dfloat_a; + half * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + (void) src1; + (void) dst; + (void) src1_ddq_i; + (void) src1_ncols; + (void) src1_padded_row_size; +} + +inline void ggml_cuda_op_mul_mat_cublas( + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) try { + + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id; + CUDA_CHECK( + DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // ldc == nrows of the matrix that cuBLAS writes into + int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + + const int compute_capability = g_device_caps[id].cc; + + if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + cuda_pool_alloc src0_as_f16; + if (src0->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = row_diff*ne00; + src0_as_f16.alloc(ne); + to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream); + } + const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16 + ? (const sycl::half *)src0_dd_i + : src0_as_f16.get(); + + cuda_pool_alloc src1_as_f16; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + size_t ne = src1_ncols*ne10; + src1_as_f16.alloc(ne); + to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); + } + const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 + ? (const sycl::half *)src1_ddf_i + : src1_as_f16.get(); + cuda_pool_alloc dst_f16(row_diff * src1_ncols); + + const sycl::half alpha_f16 = 1.0f; + const sycl::half beta_f16 = 0.0f; + + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm( + *g_cublas_handles[id], oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, + src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, + dst_f16.get(), dpct::library_data_t::real_half, ldc, + dpct::library_data_t::real_half))); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); + } + else { + cuda_pool_alloc src0_ddq_as_f32; + + if (src0->type != GGML_TYPE_F32) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); + GGML_ASSERT(to_fp32_cuda != nullptr); + src0_ddq_as_f32.alloc(row_diff*ne00); + to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); + } + const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); + + const float alpha = 1.0f; + const float beta = 0.0f; + + CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)); + CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm( + *g_cublas_handles[id], oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00, + src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]), + dst_dd_i, ldc))); + } + + (void) dst; + (void) src1_ddq_i; + (void) src1_padded_row_size; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; + } + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_glm) { + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); + } else if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, + ne00, n_dims, nrows, pos, freq_scale, ne01, + freq_base, ext_factor, attn_factor, corr_dims, + main_stream); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, + nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream); + } else { + GGML_ASSERT(false); + } + } + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + //GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); + + (void) src1; + (void) src1_dd; +} + +inline void ggml_cuda_op_im2col(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; + const int64_t IW = src1->ne[0]; + + const int64_t KH = is_2D ? src0->ne[1] : 1; + const int64_t KW = src0->ne[0]; + + const int64_t OH = is_2D ? dst->ne[2] : 1; + const int64_t OW = dst->ne[1]; + + const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + + im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, + IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + + (void) src0; + (void) src0_dd; +} + +inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_argsort(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_soft_max(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1; + + float scale = 1.0f; + memcpy(&scale, dst->op_params, sizeof(float)); + + soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); + + (void) dst; +} + +inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); + /* + DPCT1010:87: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const float *src0_dd, + const float *src1_dd, float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); + /* + DPCT1010:88: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_cuda_op_flatten(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const ggml_cuda_op_flatten_t op) try { + const int64_t nrows0 = ggml_nrows(src0); + + const bool use_src1 = src1 != nullptr; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + cuda_pool_alloc src0_f; + cuda_pool_alloc src1_f; + cuda_pool_alloc dst_f; + + ggml_cuda_set_device(g_main_device); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + if (src0_on_device) { + src0_ddf = (float *) src0_extra->data_device[g_main_device]; + } else { + src0_ddf = src0_f.alloc(ggml_nelements(src0)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); + } + + if (use_src1) { + if (src1_on_device) { + src1_ddf = (float *) src1_extra->data_device[g_main_device]; + } else { + src1_ddf = src1_f.alloc(ggml_nelements(src1)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); + } + } + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = dst_f.alloc(ggml_nelements(dst)); + } + + // do the computation + op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + /* + DPCT1010:89: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + CUDA_CHECK(0); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(DPCT_CHECK_ERROR( + main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_current_device().queues_wait_and_throw())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_set_peer_access(const int n_tokens) { + static bool peer_access_enabled = false; + + const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; + + if (peer_access_enabled == enable_peer_access) { + return; + } + +#ifdef NDEBUG + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(cudaDeviceSynchronize()); + } + + for (int id = 0; id < g_device_count; ++id) { + CUDA_CHECK(ggml_cuda_set_device(id)); + + for (int id_other = 0; id_other < g_device_count; ++id_other) { + if (id == id_other) { + continue; + } + if (id != g_main_device && id_other != g_main_device) { + continue; + } + + int can_access_peer; + CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other)); + if (can_access_peer) { + if (enable_peer_access) { + CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0)); + } else { + CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other)); + } + } + } + } +#endif // NDEBUG + + peer_access_enabled = enable_peer_access; +} + +static void ggml_cuda_op_mul_mat(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + ggml_cuda_op_mul_mat_t op, + const bool convert_src1_to_q8_1) try { + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + const int64_t nrows0 = ggml_nrows(src0); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + const int64_t nrows1 = ggml_nrows(src1); + + GGML_ASSERT(ne03 == ne13); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + + GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); + + const int64_t i02_divisor = ne12 / ne02; + + const size_t src0_ts = ggml_type_size(src0->type); + const size_t src0_bs = ggml_blck_size(src0->type); + const size_t q8_1_ts = sizeof(block_q8_1); + const size_t q8_1_bs = QK8_1; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_is_contiguous = ggml_is_contiguous(src0); + const bool src1_is_contiguous = ggml_is_contiguous(src1); + + const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 > 1)); + GGML_ASSERT(!(split && ne03 > 1)); + GGML_ASSERT(!(split && ne02 < ne12)); + + // dd = data device + char * src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float + char * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1 + float * dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + // as = actual size + size_t src0_as[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; + size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0}; + size_t dst_as[GGML_CUDA_MAX_DEVICES] = {0}; + + int64_t row_low[GGML_CUDA_MAX_DEVICES]; + int64_t row_high[GGML_CUDA_MAX_DEVICES]; + + int used_devices = 0; + + for (int64_t id = 0; id < g_device_count; ++id) { + // by default, use all rows + row_low[id] = 0; + row_high[id] = ne01; + + // for multi GPU, get the row boundaries from tensor split + // and round to mul_mat_q tile sizes + if (split) { + const int64_t rounding = get_row_rounding(src0->type); + + if (id != 0) { + row_low[id] = ne01*g_tensor_split[id]; + if (row_low[id] < ne01) { + row_low[id] -= row_low[id] % rounding; + } + } + + if (id != g_device_count - 1) { + row_high[id] = ne01*g_tensor_split[id + 1]; + if (row_high[id] < ne01) { + row_high[id] -= row_high[id] % rounding; + } + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + used_devices++; + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + + ggml_cuda_set_device(id); + const dpct::queue_ptr stream = g_cudaStreams[id][0]; + + if (src0_on_device && src0_is_contiguous) { + src0_dd[id] = (char *) src0_extra->data_device[id]; + } else { + // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + } + + if (src1_on_device && src1_is_contiguous) { + src1_ddf[id] = (float *) src1_extra->data_device[id]; + } else { + src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + } + + if (convert_src1_to_q8_1) { + src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + + if (src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + /* + DPCT1010:90: SYCL uses exceptions to report errors and does not + use the error codes. The call was replaced with 0. You need to + rewrite this code. + */ + CUDA_CHECK(0); + } + } + + if (dst_on_device) { + dst_dd[id] = (float *) dst_extra->data_device[id]; + } else { + const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); + dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]); + } + } + + // if multiple devices are used they need to wait for the main device + // here an event is recorded that signals that the main device has finished calculating the input data + if (split && used_devices > 1) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + /* + DPCT1024:91: The original code returned the error code that was further + consumed by the program logic. This original code was replaced with 0. + You may need to rewrite the program logic consuming the error code. + */ + CUDA_CHECK(DPCT_CHECK_ERROR( + *src0_extra->events[g_main_device][0] = + g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier())); + } + + const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; + for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { + const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + + const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const int64_t row_diff = row_high[id] - row_low[id]; + + ggml_cuda_set_device(id); + const dpct::queue_ptr stream = g_cudaStreams[id][is]; + + // wait for main GPU data if necessary + if (split && (id != g_main_device || is != 0)) { + CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier( + {*src0_extra->events[g_main_device][0]}))); + } + + for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { + const int64_t i03 = i0 / ne12; + const int64_t i02 = i0 % ne12; + + const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + + // for split tensors the data begins at i0 == i0_offset_low + char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; + float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); + + // the main device memory buffer can be on VRAM scratch, with space for all partial results + // in that case an offset on dst_ddf_i is needed + if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + dst_dd_i += row_low[id]; // offset is 0 if no tensor split + } + + // copy src0, src1 to device if necessary + if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (id != g_main_device) { + if (convert_src1_to_q8_1) { + char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset; + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + src1_ddq_i, src1_ddq_i_source, + src1_ncols * src1_padded_col_size * q8_1_ts / + q8_1_bs))); + } else { + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + src1_ddf_i, src1_ddf_i_source, + src1_ncols * ne10 * sizeof(float)))); + } + } + } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d( + src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); + } else { + GGML_ASSERT(false); + } + + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + /* + DPCT1010:92: SYCL uses exceptions to report errors and does + not use the error codes. The call was replaced with 0. You + need to rewrite this code. + */ + CUDA_CHECK(0); + } + + if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); + } + + // do the computation + op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); + /* + DPCT1010:93: SYCL uses exceptions to report errors and does not + use the error codes. The call was replaced with 0. You need to + rewrite this code. + */ + CUDA_CHECK(0); + + // copy dst to host or other device if necessary + if (!dst_on_device) { + void * dst_off_device; + dpct::memcpy_direction kind; + if (dst->backend == GGML_BACKEND_CPU) { + dst_off_device = dst->data; + kind = dpct::device_to_host; + } else if (dst->backend == GGML_BACKEND_GPU) { + dst_off_device = dst_extra->data_device[g_main_device]; + kind = dpct::device_to_device; + } else { + GGML_ASSERT(false); + } + if (split) { + // src0 = weight matrix is saved as a transposed matrix for better memory layout. + // dst is NOT transposed. + // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU. + // Instead they need to be copied to the correct slice in ne0 = dst row index. + // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0 + row_low[id]; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy( + dhf_dst_i, ne0 * sizeof(float), dst_dd_i, + row_diff * sizeof(float), row_diff * sizeof(float), + src1_ncols, kind, *stream))); + } else { + float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); + GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); + dhf_dst_i += src1_col_0*ne0; + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(dhf_dst_i, dst_dd_i, + src1_ncols * ne0 * sizeof(float)))); + } + } + + // add event for the main device to wait on until other device is done + if (split && (id != g_main_device || is != 0)) { + /* + DPCT1024:94: The original code returned the error code that + was further consumed by the program logic. This original + code was replaced with 0. You may need to rewrite the + program logic consuming the error code. + */ + CUDA_CHECK(DPCT_CHECK_ERROR( + *src0_extra->events[id][is] = + stream->ext_oneapi_submit_barrier())); + } + } + } + } + + for (int64_t id = 0; id < g_device_count; ++id) { + if ((!split && id != g_main_device) || row_low[id] == row_high[id]) { + continue; + } + CUDA_CHECK(ggml_cuda_set_device(id)); + + // free buffers again when done + if (dst_as[id] > 0) { + ggml_cuda_pool_free(dst_dd[id], dst_as[id]); + } + if (src1_asq[id] > 0) { + ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]); + } + if (src1_asf[id] > 0) { + ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]); + } + if (src0_as[id] > 0) { + ggml_cuda_pool_free(src0_dd[id], src0_as[id]); + } + } + + // main device waits for all other devices to be finished + if (split && g_device_count > 1) { + int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; + is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + for (int64_t id = 0; id < g_device_count; ++id) { + if (row_low[id] == row_high[id]) { + continue; + } + for (int64_t is = 0; is < is_max; ++is) { + CUDA_CHECK(DPCT_CHECK_ERROR( + g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier( + {*src0_extra->events[id][is]}))); + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_current_device().queues_wait_and_throw())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + +static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); +} + +static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc); +} + +static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); +} + +static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div); +} + +static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); +} + +static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); +} + +static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick); +} + +static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh); +} + +static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu); +} + +static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu); +} + +static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr); +} + +static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); +} + +static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm); +} + +static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat); +} + +static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale); +} + +static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); +} + +static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (!g_cublas_loaded) return false; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); +} + +static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation + GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(!ggml_is_permuted(src0)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne12 = src1->ne[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + const int64_t row_stride_x = nb01 / sizeof(sycl::half); + const int64_t channel_stride_x = nb02 / sizeof(sycl::half); + + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void k_compute_batched_ptrs(const sycl::half *src0_as_f16, + const sycl::half *src1_as_f16, char *dst, + const void **ptrs_src, void **ptrs_dst, + int64_t ne12, int64_t ne13, int64_t ne23, + size_t nb02, size_t nb03, size_t nb12, + size_t nb13, size_t nbd2, size_t nbd3, + int64_t r2, int64_t r3, + const sycl::nd_item<3> &item_ct1) { + int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int64_t i03 = i13 / r3; + int64_t i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; +} + +static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK( + DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream)); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + sycl::half *src0_as_f16 = (sycl::half *)src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + cuda_pool_alloc src1_as_f16(ne1); + to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream); + + cuda_pool_alloc dst_f16; + char * dst_t; + + dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F; + dpct::library_data_t cu_data_type = dpct::library_data_t::real_half; + + // dst strides + size_t nbd2 = dst->nb[2]; + size_t nbd3 = dst->nb[3]; + + const sycl::half alpha_f16 = 1.0f; + const sycl::half beta_f16 = 0.0f; + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + const void * alpha = &alpha_f16; + const void * beta = &beta_f16; + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + dst_t = (char *) dst_f16.alloc(ne); + + nbd2 /= sizeof(float) / sizeof(sycl::half); + nbd3 /= sizeof(float) / sizeof(sycl::half); + } else { + dst_t = (char *) dst_ddf; + + cu_compute_type = CUBLAS_COMPUTE_32F; + cu_data_type = dpct::library_data_t::real_float; + + alpha = &alpha_f32; + beta = &beta_f32; + } + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + beta, ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01, + cu_compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( + *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + (const char *)src0_as_f16, dpct::library_data_t::real_half, + nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half), + (const char *)src1_as_f16.get(), dpct::library_data_t::real_half, + nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta, + (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float), + ne12 * ne13, cu_compute_type))); + } else { + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + cuda_pool_alloc ptrs_src(2*ne23); + cuda_pool_alloc< void *> ptrs_dst(1*ne23); + + sycl::range<3> block_dims(1, ne12, ne13); + /* + DPCT1049:47: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + { + dpct::has_capability_or_fail(main_stream->get_device(), + {sycl::aspect::fp16}); + + main_stream->submit([&](sycl::handler &cgh) { + const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get(); + const void **ptrs_src_get_ct3 = ptrs_src.get(); + void **ptrs_dst_get_ct4 = ptrs_dst.get(); + + cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_compute_batched_ptrs( + src0_as_f16, src1_as_f16_get_ct1, + dst_t, ptrs_src_get_ct3, + ptrs_dst_get_ct4, ne12, ne13, ne23, + nb02, nb03, nb12, nb13, nbd2, nbd3, r2, + r3, item_ct1); + }); + }); + } + /* + DPCT1010:95: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this + code. + */ + CUDA_CHECK(0); + + CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch( + *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + (const void **)(ptrs_src.get() + 0 * ne23), + dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), + (const void **)(ptrs_src.get() + 1 * ne23), + dpct::library_data_t::real_half, nb11 / sizeof(float), beta, + (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, + cu_compute_type))); + } +#endif + + if (dst->op_params[0] == GGML_PREC_DEFAULT) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const bool all_on_device = + (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_GPU) && + ( dst->backend == GGML_BACKEND_GPU); + + const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + + int64_t min_compute_capability = INT_MAX; + for (int64_t id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } + } + +#ifdef CUDA_USE_TENSOR_CORES + const bool use_tensor_cores = true; +#else + const bool use_tensor_cores = false; +#endif + + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + + if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // KQ single-batch + ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + // KQV single-batch + ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + // KQ + KQV multi-batch + ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); + } else if (src0->type == GGML_TYPE_F32) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { + if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { +#ifdef GGML_CUDA_FORCE_DMMV + const bool use_mul_mat_vec_q = false; +#else + const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; +#endif // GGML_CUDA_FORCE_DMMV + + if (use_mul_mat_vec_q) { + // NOTE: this kernel does not support ggml_nrows(src1) > 1 + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + } + } else { + bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type); + + // when tensor cores are available, use them for large batch size + // ref: https://github.com/ggerganov/llama.cpp/pull/3776 + if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) { + use_mul_mat_q = false; + } + + if (use_mul_mat_q) { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + } + } + } else { + GGML_ASSERT(false); + } +} + +#if 0 +template +static __global__ void k_compute_batched_ptrs_id( + const void ** ptrs_src, void ** ptrs_dst, + int ne12, int ne13, + int ne23, + int nb02, int nb03, + int nb12, int nb13, + int nb2, int nb3, + int r2, int r3, + ggml_type src0_type, half * src0_as_f16, int64_t src0_ne, + const half * src1_f16, half * dst_f16, + const int32_t * ids, const int id, + Srcs... src0s) { + + int i = ids[id]; + + half * src0_f16; + const void * srcs_ar[] = { (const half *) src0s... }; + if (src0_type == GGML_TYPE_F16) { + src0_f16 = (half *) srcs_ar[i]; + } else { + src0_f16 = src0_as_f16; + if (threadIdx.x == 0 && threadIdx.y == 0) { + const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type); + to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget); + } + } + + int i13 = blockIdx.x * blockDim.x + threadIdx.x; + int i12 = blockIdx.y * blockDim.y + threadIdx.y; + + if (i13 >= ne13 || i12 >= ne12) { + return; + } + + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; +} + +static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { + const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + const struct ggml_tensor * src00 = dst->src[2]; + + const int id = dst->op_params[0]; + + GGML_ASSERT(!ggml_is_transposed(src00)); + GGML_ASSERT(!ggml_is_transposed(src1)); + + GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src00->ne[1]; + const int64_t ne02 = src00->ne[2]; + const int64_t ne03 = src00->ne[3]; + + //const int64_t nb01 = src00->nb[1]; + const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + //const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); + + //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + //void * src0_ddq = src0_extra->data_device[g_main_device]; + //half * src0_as_f16 = (half *) src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + + // use cublasGemmBatchedEx + const int ne23 = ne12*ne13; + + const void ** ptrs_src = nullptr; + void ** ptrs_dst = nullptr; + + size_t ptrs_src_s = 0; + size_t ptrs_dst_s = 0; + + ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s); + ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s); + + int64_t src0_ne = ggml_nelements(src00); + half * src0_as_f16 = nullptr; + size_t src0_as = 0; + if (src00->type != GGML_TYPE_F16) { + src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as); + } + + static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6"); + dim3 block_dims(ne13, ne12); + k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>( + ptrs_src, ptrs_dst, + ne12, ne13, + ne23, + ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half), + nb12, nb13, + dst->nb[2], dst->nb[3], + r2, r3, + src00->type, src0_as_f16, src0_ne, + src1_as_f16, dst_f16, + (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id, + dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr, + dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr, + dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr, + dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr + ); + CUDA_CHECK(cudaGetLastError()); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00, + (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10, + &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + if (src0_as != 0) { + ggml_cuda_pool_free(src0_as_f16, src0_as); + } + if (ptrs_src_s != 0) { + ggml_cuda_pool_free(ptrs_src, ptrs_src_s); + } + if (ptrs_dst_s != 0) { + ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s); + } + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(src1_as_f16, src1_as); + ggml_cuda_pool_free(dst_f16, dst_as); +} +#endif + +static void ggml_cuda_mul_mat_id(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { +#if 0 + ggml_cuda_mul_mat_id_cublas(dst); + // TODO: mmq/mmv support +#endif + + const int64_t nb11 = src1->nb[1]; + const int64_t nb1 = dst->nb[1]; + + const struct ggml_tensor * ids = src0; + const int32_t id = ((int32_t *) dst->op_params)[0]; + const int32_t n_as = ((int32_t *) dst->op_params)[1]; + + std::vector ids_host(ggml_nbytes(ids)); + + const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0]; + + if (ids->backend == GGML_BACKEND_GPU) { + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); + CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait())); + } else { + memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); + } + + const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; + + ggml_tensor_extra_gpu src1_row_extra; + ggml_tensor_extra_gpu dst_row_extra; + + ggml_tensor src1_row = *src1; + ggml_tensor dst_row = *dst; + + src1_row.backend = GGML_BACKEND_GPU; + dst_row.backend = GGML_BACKEND_GPU; + + src1_row.extra = &src1_row_extra; + dst_row.extra = &dst_row_extra; + + char * src1_original = src1->backend == GGML_BACKEND_CPU ? + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; + char * dst_original = dst->backend == GGML_BACKEND_CPU ? + (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + + if (src1->ne[1] == 1) { + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + //int32_t row_id; + //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; + src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? + + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; + dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + } + } else { + cuda_pool_alloc src1_contiguous(sizeof(float)*ggml_nelements(src1)); + cuda_pool_alloc dst_contiguous(sizeof(float)*ggml_nelements(dst)); + + src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); + dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); + + const dpct::memcpy_direction src1_kind = + src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device + : dpct::device_to_device; + const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU + ? dpct::device_to_host + : dpct::device_to_device; + + for (int32_t row_id = 0; row_id < n_as; ++row_id) { + const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + + int64_t num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(DPCT_CHECK_ERROR( + stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11, + src1_original + i01 * nb11, nb11))); + num_src1_rows++; + } + + if (num_src1_rows == 0) { + continue; + } + + src1_row.ne[1] = num_src1_rows; + dst_row.ne[1] = num_src1_rows; + + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; + + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; + + ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + + num_src1_rows = 0; + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + + if (row_id_i != row_id) { + continue; + } + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + + CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy( + dst_original + i01 * nb1, + dst_contiguous.get() + num_src1_rows * nb1, nb1))); + num_src1_rows++; + } + } + } + + if (dst->backend == GGML_BACKEND_CPU) { + CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); +} + +static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp); +} + +static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst) try { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0]; + + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } + + (void) dst; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + +static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); +} + +static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); +} + +static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); +} + +static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); +} + +static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); +} + +static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows); +} + +static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort); +} + +static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + (void) src0; + (void) src1; + (void) dst; +} + +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + +void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try { + const int64_t nrows = ggml_nrows(tensor); + + const int64_t ne0 = tensor->ne[0]; + + const size_t nb1 = tensor->nb[1]; + + ggml_backend_type backend = tensor->backend; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + + for (int64_t id = 0; id < g_device_count; ++id) { + if (backend == GGML_BACKEND_GPU && id != g_main_device) { + continue; + } + + ggml_cuda_set_device(id); + + int64_t row_low, row_high; + if (backend == GGML_BACKEND_GPU) { + row_low = 0; + row_high = nrows; + } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; + row_low -= row_low % rounding; + + if (id == g_device_count - 1) { + row_high = nrows; + } else { + row_high = nrows*g_tensor_split[id + 1]; + row_high -= row_high % rounding; + } + } else { + GGML_ASSERT(false); + } + if (row_low == row_high) { + continue; + } + + int64_t nrows_split = row_high - row_low; + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf; + CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + char * buf_host = (char *)data + offset_split; + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue() + .memset(buf + original_size, 0, size - original_size) + .wait())); + } + + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy(buf, buf_host, original_size) + .wait())); + + extra->data_device[id] = buf; + + if (backend == GGML_BACKEND_GPU_SPLIT) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] = + new sycl::event())); + } + } + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_free_data(struct ggml_tensor *tensor) try { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + return; + } + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + for (int64_t id = 0; id < g_device_count; ++id) { + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free( + extra->data_device[id], dpct::get_in_order_queue()))); + } + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(ggml_cuda_set_device(id)); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::destroy_event(extra->events[id][is]))); + } + } + } + + delete extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + +static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor, + bool scratch, bool force_inplace, + bool no_alloc) try { + if (scratch && g_scratch_size == 0) { + return; + } + + tensor->backend = GGML_BACKEND_GPU; + + // recursively assign CUDA buffers until a compute tensor is found + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + const ggml_op src0_op = tensor->src[0]->op; + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { + ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); + } + } + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); + } + + if (scratch && no_alloc) { + return; + } + + ggml_tensor_extra_gpu * extra; + + const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || + tensor->op == GGML_OP_VIEW || + force_inplace; + const size_t size = ggml_nbytes(tensor); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&offset, tensor->op_params, sizeof(size_t)); + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src0_ddc + offset; + } else if (tensor->op == GGML_OP_CPY) { + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = src1_ddv; + } else if (scratch) { + GGML_ASSERT(size <= g_scratch_size); + if (g_scratch_offset + size > g_scratch_size) { + g_scratch_offset = 0; + } + + char * data = (char *) g_scratch_buffer; + if (data == nullptr) { + CUDA_CHECK(DPCT_CHECK_ERROR( + data = (char *)sycl::malloc_device( + g_scratch_size, dpct::get_in_order_queue()))); + g_scratch_buffer = data; + } + extra = ggml_cuda_alloc_temp_tensor_extra(); + extra->data_device[g_main_device] = data + g_scratch_offset; + + g_scratch_offset += size; + + GGML_ASSERT(g_scratch_offset <= g_scratch_size); + } else { // allocate new buffers outside of scratch + void * data; + CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue().memset(data, 0, size).wait())); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); + extra->data_device[g_main_device] = data; + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor, + size_t offset) try { + if (g_scratch_size == 0) { + return; + } + if (g_scratch_buffer == nullptr) { + ggml_cuda_set_device(g_main_device); + CUDA_CHECK( + DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device( + g_scratch_size, dpct::get_in_order_queue()))); + } + + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + + const bool inplace = tensor->view_src != nullptr; + + if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + size_t view_offset = 0; + if (tensor->op == GGML_OP_VIEW) { + memcpy(&view_offset, tensor->op_params, sizeof(size_t)); + } + extra->data_device[g_main_device] = src0_ddc + view_offset; + } else { + extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; + } + + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_contiguous(tensor)); + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy(extra->data_device[g_main_device], + tensor->data, ggml_nbytes(tensor)) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, false); +} + +void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, true, false, true); +} + +void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, false, false); +} + +void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { + ggml_cuda_assign_buffers_impl(tensor, false, true, false); +} + +void ggml_cuda_set_main_device(const int main_device) try { + if (main_device >= g_device_count) { + fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", + main_device, g_device_count, g_main_device); + return; + } + + if (g_main_device != main_device && g_device_count > 1) { + g_main_device = main_device; + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(g_main_device)))); + fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, + g_main_device, prop.get_name()); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_set_scratch_size(const size_t scratch_size) { + // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously + // it still won't always work as expected, but it's better than nothing + if (scratch_size > g_scratch_size) { + ggml_cuda_free_scratch(); + } + g_scratch_size = std::max(g_scratch_size, scratch_size); +} + +void ggml_cuda_free_scratch() try { + if (g_scratch_buffer == nullptr) { + return; + } + + CUDA_CHECK(DPCT_CHECK_ERROR( + sycl::free(g_scratch_buffer, dpct::get_in_order_queue()))); + g_scratch_buffer = nullptr; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + if (!g_cublas_loaded) return false; + + ggml_cuda_func_t func; + const bool any_on_device = tensor->backend == GGML_BACKEND_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { + return false; + } + + if (tensor->op == GGML_OP_MUL_MAT) { + if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); +#endif + return false; + } + } + + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; + case GGML_OP_DUP: + func = ggml_cuda_dup; + break; + case GGML_OP_ADD: + func = ggml_cuda_add; + break; + case GGML_OP_ACC: + func = ggml_cuda_acc; + break; + case GGML_OP_MUL: + func = ggml_cuda_mul; + break; + case GGML_OP_DIV: + func = ggml_cuda_div; + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_GELU: + func = ggml_cuda_gelu; + break; + case GGML_UNARY_OP_SILU: + func = ggml_cuda_silu; + break; + case GGML_UNARY_OP_GELU_QUICK: + func = ggml_cuda_gelu_quick; + break; + case GGML_UNARY_OP_TANH: + func = ggml_cuda_tanh; + break; + case GGML_UNARY_OP_RELU: + func = ggml_cuda_relu; + break; + default: + return false; + } + break; + case GGML_OP_NORM: + func = ggml_cuda_norm; + break; + case GGML_OP_GROUP_NORM: + func = ggml_cuda_group_norm; + break; + case GGML_OP_CONCAT: + func = ggml_cuda_concat; + break; + case GGML_OP_UPSCALE: + func = ggml_cuda_upscale; + break; + case GGML_OP_PAD: + func = ggml_cuda_pad; + break; + case GGML_OP_LEAKY_RELU: + func = ggml_cuda_leaky_relu; + break; + case GGML_OP_RMS_NORM: + func = ggml_cuda_rms_norm; + break; + case GGML_OP_MUL_MAT: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat; + break; + case GGML_OP_MUL_MAT_ID: + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { + return false; + } + func = ggml_cuda_mul_mat_id; + break; + case GGML_OP_SCALE: + func = ggml_cuda_scale; + break; + case GGML_OP_SQR: + func = ggml_cuda_sqr; + break; + case GGML_OP_CLAMP: + func = ggml_cuda_clamp; + break; + case GGML_OP_CPY: + func = ggml_cuda_cpy; + break; + case GGML_OP_CONT: + func = ggml_cuda_dup; + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + func = ggml_cuda_nop; + break; + case GGML_OP_DIAG_MASK_INF: + func = ggml_cuda_diag_mask_inf; + break; + case GGML_OP_SOFT_MAX: + func = ggml_cuda_soft_max; + break; + case GGML_OP_ROPE: + func = ggml_cuda_rope; + break; + case GGML_OP_ALIBI: + func = ggml_cuda_alibi; + break; + case GGML_OP_IM2COL: + func = ggml_cuda_im2col; + break; + case GGML_OP_SUM_ROWS: + func = ggml_cuda_sum_rows; + break; + case GGML_OP_ARGSORT: + func = ggml_cuda_argsort; + break; + default: + return false; + } + + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); + } + + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return true; + } + func(tensor->src[0], tensor->src[1], tensor); + return true; +} + +int ggml_cuda_get_device_count() try { + int device_count; + if (DPCT_CHECK_ERROR(device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + return 0; + } + return device_count; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_cuda_get_device_description(int device, char *description, + size_t description_size) try { + dpct::device_info prop; + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device)))); + snprintf(description, description_size, "%s", prop.get_name()); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED GGML_UNUSED + +// cuda buffer + +struct ggml_backend_buffer_context_cuda { + int device; + void * dev_ptr = nullptr; + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + + ~ggml_backend_buffer_context_cuda() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void +ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + CUDA_CHECK( + DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue()))); + delete ctx; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->dev_ptr; +} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[ctx->device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset( + (char *)tensor->data + original_size, 0, + padded_size - original_size))); + } + } + + UNUSED(buffer); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor, + const void *data, size_t offset, + size_t size) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memcpy((char *)tensor->data + offset, data, size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *tensor, + void *data, size_t offset, + size_t size) try { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK(DPCT_CHECK_ERROR( + dpct::get_in_order_queue() + .memcpy(data, (const char *)tensor->data + offset, size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, + uint8_t value) try { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK( + DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); + + CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue() + .memset(ctx->dev_ptr, value, buffer->size) + .wait())); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_buffer_clear, +}; + +// cuda buffer type + +static ggml_backend_buffer_t +ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) try { + int device = (int) (intptr_t) buft->context; + + ggml_cuda_set_device(device); + + size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 + + void * dev_ptr; + CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device( + size, dpct::get_in_order_queue()))); + + ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + + return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + UNUSED(buft); +} + +static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ nullptr, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { + ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, + /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + }; + } + ggml_backend_cuda_buffer_type_initialized = true; + } + + return &ggml_backend_cuda_buffer_types[device]; +} + +// host buffer type + +static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cuda_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cuda_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + // FIXME: this is a hack to avoid having to implement a new buffer type + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ { + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cuda_buffer_type_host; +} + +// backend + +struct ggml_backend_context_cuda { + int device; +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + delete cuda_ctx; + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + return ggml_backend_cuda_buffer_type(cuda_ctx->device); +} + +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, + ggml_tensor *tensor, + const void *data, size_t offset, + size_t size) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy( + (char *)tensor->data + offset, data, size))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, + const ggml_tensor *tensor, + void *data, size_t offset, + size_t size) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy( + data, (const char *)tensor->data + offset, size))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait())); + + UNUSED(backend); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { + GGML_ASSERT(!"not implemented"); + + return nullptr; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + ggml_cuda_set_main_device(cuda_ctx->device); + + ggml_compute_params params = {}; + params.type = GGML_TASK_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + continue; + + assert(node->backend == GGML_BACKEND_GPU); + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->extra != nullptr); + + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->extra != nullptr); + } + } + + bool ok = ggml_cuda_compute_forward(¶ms, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + +#if 0 + if (node->type == GGML_TYPE_F32) { + cudaDeviceSynchronize(); + std::vector tmp(ggml_nelements(node), 0.0f); + cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); + printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), + ggml_type_name(node->src[0]->type), + node->src[1] ? ggml_type_name(node->src[1]->type) : "none", + node->src[0]->name, + node->src[1] ? node->src[1]->name : "none"); + double sum = 0.0; + double sq_sum = 0.0; + for (int i = 0; i < ggml_nelements(node); i++) { + printf("%f ", tmp[i]); + sum += tmp[i]; + sq_sum += tmp[i]*tmp[i]; + } + printf("\n"); + printf("sum: %f, ", sum); + printf("sq_sum: %f\n", sq_sum); + } +#endif + } + + UNUSED(backend); +} + +static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + break; + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + { + struct ggml_tensor * a; + struct ggml_tensor * b; + if (op->op == GGML_OP_MUL_MAT) { + a = op->src[0]; + b = op->src[1]; + } else { + a = op->src[2]; + b = op->src[1]; + } + if (a->ne[3] != b->ne[3]) { + return false; + } + return true; + } break; + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: + { + ggml_type src0_type = op->src[0]->type; + ggml_type src1_type = op->src[1]->type; + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) { + return true; + } + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) { + return true; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return true; + } + return false; + } break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_REPEAT: + case GGML_OP_DUP: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + case GGML_OP_ALIBI: + case GGML_OP_IM2COL: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_CONCAT: + case GGML_OP_GROUP_NORM: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + UNUSED(backend); +} + +static ggml_backend_i cuda_backend_i = { + /* .get_name = */ ggml_backend_cuda_name, + /* .free = */ ggml_backend_cuda_free, + /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ ggml_backend_cuda_supports_op, +}; + +ggml_backend_t ggml_backend_cuda_init(int device) { + ggml_init_cublas(); // TODO: remove from ggml.c + + if (device < 0 || device >= ggml_cuda_get_device_count()) { + fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + // not strictly necessary, but it may reduce the overhead of the first graph_compute + ggml_cuda_set_main_device(device); + + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { + /* .device = */ device + }; + + ggml_backend_t cuda_backend = new ggml_backend { + /* .interface = */ cuda_backend_i, + /* .context = */ ctx + }; + + return cuda_backend; +} + +bool ggml_backend_is_cuda(ggml_backend_t backend) { + return backend->iface.get_name == ggml_backend_cuda_name; +} + +static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { + ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); + return cuda_backend; + + UNUSED(params); +} + +extern "C" int ggml_backend_cuda_reg_devices(); + +int ggml_backend_cuda_reg_devices() { + int device_count = ggml_cuda_get_device_count(); + //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization + for (int i = 0; i < device_count; i++) { + char name[128]; + snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i); + ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i); + } + return device_count; +} diff --git a/dpcpp_out2/ggml-cuda.h b/dpcpp_out2/ggml-cuda.h new file mode 100644 index 0000000000000..cdb0c0c41618a --- /dev/null +++ b/dpcpp_out2/ggml-cuda.h @@ -0,0 +1,64 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_HIPBLAS +#define GGML_CUDA_NAME "ROCm" +#define GGML_CUBLAS_NAME "hipBLAS" +#else +#define GGML_CUDA_NAME "CUDA" +#define GGML_CUBLAS_NAME "cuBLAS" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_CUDA_MAX_DEVICES 16 + +// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. +GGML_API void ggml_init_cublas(void); + +// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. +GGML_API bool ggml_cublas_loaded(void); + +GGML_API void * ggml_cuda_host_malloc(size_t size); +GGML_API void ggml_cuda_host_free(void * ptr); + +GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); +GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); +GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); +GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); + +GGML_API void ggml_cuda_set_main_device(int main_device); +GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); +GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); +GGML_API void ggml_cuda_free_scratch(void); +GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + +GGML_API int ggml_cuda_get_device_count(void); +GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); + +// backend API +GGML_API ggml_backend_t ggml_backend_cuda_init(int device); + +GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); + +// pinned host buffer for use with CPU backend for faster copies between CPU and GPU +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); + +#ifdef __cplusplus +} +#endif diff --git a/dpcpp_out2/ggml.h b/dpcpp_out2/ggml.h new file mode 100644 index 0000000000000..5bb5323434e01 --- /dev/null +++ b/dpcpp_out2/ggml.h @@ -0,0 +1,2253 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_context * ctx = ggml_init(params); +// +// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// +// ggml_set_param(ctx, x); // x is an input variable +// +// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); +// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: +// +// { +// ... +// +// struct ggml_cgraph * gf = ggml_new_graph(ctx); +// ggml_build_forward_expand(gf, f); +// +// // set the input variable and parameter values +// ggml_set_f32(x, 2.0f); +// ggml_set_f32(a, 3.0f); +// ggml_set_f32(b, 4.0f); +// +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); +// +// printf("f = %f\n", ggml_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_graph_compute() function. +// +// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory +// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_permute() +// - ggml_conv_1d_1s() +// - ggml_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? +// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_tensor) +// +// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_tensor * c = ggml_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// const int nx = 2; +// const int ny = 3; +// +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); +// +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. +// +// ## The matrix multiplication operator (ggml_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_API __declspec(dllexport) +# else +# define GGML_API __declspec(dllimport) +# endif +# else +# define GGML_API __attribute__ ((visibility ("default"))) +# endif +#else +# define GGML_API +#endif + +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif + +#define DPCT_PROFILING_ENABLED +#include +#include +#include +#include +#include + +#define GGML_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_FILE_VERSION 1 + +#define GGML_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_QNT_VERSION_FACTOR 1000 // do not change this + +#define GGML_MAX_DIMS 4 +#define GGML_MAX_PARAMS 2048 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_SRC 10 +#define GGML_MAX_NAME 64 +#define GGML_MAX_OP_PARAMS 64 +#define GGML_DEFAULT_N_THREADS 4 +#define GGML_DEFAULT_GRAPH_SIZE 2048 +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +#define GGUF_MAGIC "GGUF" + +#define GGUF_VERSION 3 + +#define GGUF_DEFAULT_ALIGNMENT 32 + +#define GGML_UNUSED(x) (void)(x) + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fflush(stdout); \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + ggml_print_backtrace(); \ + abort(); \ + } \ + } while (0) + +#ifndef NDEBUG +#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached") +#elif defined(__GNUC__) +#define GGML_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +#define GGML_UNREACHABLE() __assume(0) +#else +#define GGML_UNREACHABLE() ((void) 0) +#endif + +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. +// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__ARM_NEON) && defined(SYCL_LANGUAGE_VERSION) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + typedef __fp16 ggml_fp16_t; +#else + typedef uint16_t ggml_fp16_t; +#endif + + // convert FP16 <-> FP32 + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); + + struct ggml_object; + struct ggml_context; + + enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 (5) support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + // k-quantizations + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, + }; + + // precision + enum ggml_prec { + GGML_PREC_DEFAULT, + GGML_PREC_F32, + }; + + enum ggml_backend_type { + GGML_BACKEND_CPU = 0, + GGML_BACKEND_GPU = 10, + GGML_BACKEND_GPU_SPLIT = 20, + }; + + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + }; + + // available tensor operations: + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_ADD1, + GGML_OP_ACC, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_LOG, + GGML_OP_SUM, + GGML_OP_SUM_ROWS, + GGML_OP_MEAN, + GGML_OP_ARGMAX, + GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, + GGML_OP_SILU_BACK, + GGML_OP_NORM, // normalize + GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, + + GGML_OP_MUL_MAT, + GGML_OP_MUL_MAT_ID, + GGML_OP_OUT_PROD, + + GGML_OP_SCALE, + GGML_OP_SET, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_DIAG, + GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, + GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, + GGML_OP_ROPE, + GGML_OP_ROPE_BACK, + GGML_OP_ALIBI, + GGML_OP_CLAMP, + GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_IM2COL, + GGML_OP_CONV_TRANSPOSE_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, + GGML_OP_UPSCALE, // nearest interpolate + GGML_OP_PAD, + GGML_OP_ARGSORT, + GGML_OP_LEAKY_RELU, + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, + + GGML_OP_UNARY, + + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + + GGML_OP_COUNT, + }; + + enum ggml_unary_op { + GGML_UNARY_OP_ABS, + GGML_UNARY_OP_SGN, + GGML_UNARY_OP_NEG, + GGML_UNARY_OP_STEP, + GGML_UNARY_OP_TANH, + GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_QUICK, + GGML_UNARY_OP_SILU, + + GGML_UNARY_OP_COUNT, + }; + + enum ggml_object_type { + GGML_OBJECT_TENSOR, + GGML_OBJECT_GRAPH, + GGML_OBJECT_WORK_BUFFER + }; + + enum ggml_log_level { + GGML_LOG_LEVEL_ERROR = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_INFO = 4, + GGML_LOG_LEVEL_DEBUG = 5 + }; + + // ggml object + struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; + }; + + static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + + // n-dimensional tensor + struct ggml_tensor { + enum ggml_type type; + enum ggml_backend_type backend; + + struct ggml_backend_buffer * buffer; + + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = ggml_type_size(type) + // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + + bool is_param; + + struct ggml_tensor * grad; + struct ggml_tensor * src[GGML_MAX_SRC]; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + + struct ggml_tensor * view_src; + size_t view_offs; + + void * data; + + char name[GGML_MAX_NAME]; + + void * extra; // extra things e.g. for ggml-cuda.cu + + char padding[8]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + + enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT + }; + + struct ggml_hash_set { + size_t size; + struct ggml_tensor ** keys; + }; + + // computation graph + struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_table; + + enum ggml_cgraph_eval_order order; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + }; + + // scratch buffer + struct ggml_scratch { + size_t offs; + size_t size; + void * data; + }; + + struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + + // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. + enum ggml_task_type { + GGML_TASK_INIT = 0, + GGML_TASK_COMPUTE, + GGML_TASK_FINALIZE, + }; + + struct ggml_compute_params { + enum ggml_task_type type; + + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + }; + + // misc + + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); + + GGML_API void ggml_print_backtrace(void); + + GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); + + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + + GGML_API int ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_DEPRECATED( + GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float + "use ggml_row_size() instead"); + + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); + + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + + GGML_API bool ggml_is_quantized(enum ggml_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + + // main + + GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); + GGML_API void ggml_free(struct ggml_context * ctx); + + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); + + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); + + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); + + // Context tensor enumeration and lookup + GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + // Converts a flat index into coordinates + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); + + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) + GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + + // + // operations on tensors with backpropagation + // + + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type); + + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // dst = a + // view(dst, nb1, nb2, nb3, offset) += b + // return dst + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return scalar + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // mean along rows + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // sums repetitions in a into shape of b + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_leaky_relu( + struct ggml_context * ctx, + struct ggml_tensor * a, float negative_slope, bool inplace); + + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // normalize along rows + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps); + + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // change the precision of a matrix multiplication + // set to GGML_PREC_F32 for higher precision (useful for phi-2) + GGML_API void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + + // indirect matrix multiplication + // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) + GGML_API struct ggml_tensor * ggml_mul_mat_id( + struct ggml_context * ctx, + struct ggml_tensor * const as[], + int n_as, + struct ggml_tensor * ids, + int id, + struct ggml_tensor * b); + + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); + + // a -> b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // a -> b, in-place, return view(b) + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // make contiguous + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, in-place + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, with new shape + GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // supports 3D: a->ne[2] == b->ne[1] + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // set elements above the diagonal to -INF + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // fused soft_max(a*scale + mask) + // mask is optional + GGML_API struct ggml_tensor * ggml_soft_max_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale); + + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // rotary position embedding + // if mode & 1 == 1, skip n_past elements (DEPRECATED) + // if mode & 2 == 1, GPT-NeoX style + // if mode & 4 == 1, ChatGLM style + // + // b is an int32 vector with size a->ne[2], it contains the positions + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx); + + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // compute correction dims for YaRN RoPE scaling + void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); + + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + float base, + bool down); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx, + int n_orig_ctx, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, + float xpos_base, + bool xpos_down); + + // alibi position embedding + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head, + float bias_max); + + // clamp + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + + GGML_API struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D); + + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, // stride + int p0, // padding + int d0); // dilation + + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0); + + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1); + + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + // the result will have 2*p0 padding for the first dimension + // and 2*p1 padding for the second dimension + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + float p0, + float p1); + + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] + GGML_API struct ggml_tensor * ggml_pad( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3); + + // sort rows + enum ggml_sort_order { + GGML_SORT_ASC, + GGML_SORT_DESC, + }; + + GGML_API struct ggml_tensor * ggml_argsort( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_sort_order order); + + // top k elements per row + GGML_API struct ggml_tensor * ggml_top_k( + struct ggml_context * ctx, + struct ggml_tensor * a, + int k); + + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); + + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + + // reverse of ggml_win_part + // used in sam + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + // custom operators + + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); + + // + // automatic differentiation + // + + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + + + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); + + // graph allocation in a context + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + + GGML_API size_t ggml_graph_overhead(void); + GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + + // print info and performance information for the graph + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + + // dump the graph into a file using the dot format + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + + // build gradient checkpointing backward graph gb for gf using provided checkpoints + // gb_tmp will contain original backward graph with rewritten backward process nodes, + // but without the second forward pass nodes. + GGML_API void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints); + // + // optimization + // + + // optimization methods + enum ggml_opt_type { + GGML_OPT_ADAM, + GGML_OPT_LBFGS, + }; + + // linesearch methods + enum ggml_linesearch { + GGML_LINESEARCH_DEFAULT = 1, + + GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + }; + + // optimization return values + enum ggml_opt_result { + GGML_OPT_OK = 0, + GGML_OPT_DID_NOT_CONVERGE, + GGML_OPT_NO_CONTEXT, + GGML_OPT_INVALID_WOLFE, + GGML_OPT_FAIL, + GGML_OPT_CANCEL, + + GGML_LINESEARCH_FAIL = -128, + GGML_LINESEARCH_MINIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_LINESEARCH_INVALID_PARAMETERS, + }; + + typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + + // optimization parameters + // + // see ggml.c (ggml_opt_default_params) for default values + // + struct ggml_opt_params { + enum ggml_opt_type type; + + size_t graph_size; + + int n_threads; + + // delta-based convergence test + // + // if past == 0 - disabled + // if past > 0: + // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) + // + int past; + float delta; + + // maximum number of iterations without improvement + // + // if 0 - disabled + // if > 0: + // assume convergence if no cost improvement in this number of iterations + // + int max_no_improvement; + + bool print_forward_graph; + bool print_backward_graph; + + int n_gradient_accumulation; + + // ADAM parameters + struct { + int n_iter; + + float sched; // schedule multiplier (fixed, decay or warmup) + float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay + float alpha; // learning rate + float beta1; + float beta2; + float eps; // epsilon for numerical stability + float eps_f; // epsilon for convergence test + float eps_g; // epsilon for convergence test + float gclip; // gradient clipping + } adam; + + // LBFGS parameters + struct { + int m; // number of corrections to approximate the inv. Hessian + int n_iter; + int max_linesearch; + + float eps; // convergence tolerance + float ftol; // line search tolerance + float wolfe; + float min_step; + float max_step; + + enum ggml_linesearch linesearch; + } lbfgs; + }; + + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; + + int iter; + int64_t nx; // number of parameter elements + + bool just_initialized; + + float loss_before; + float loss_after; + + struct { + struct ggml_tensor * g; // current gradient + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * pf; // past function values + float fx_best; + float fx_prev; + int n_no_improvement; + } adam; + + struct { + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y + float fx_best; + float step; + int j; + int k; + int end; + int n_no_improvement; + } lbfgs; + }; + + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); + + // optimize the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); + + // initialize optimizer context + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); + + // continue optimizing the function defined by the tensor f + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); + + // + // quantization + // + + // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk + GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id); + + // will abort if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id); + GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(f, ...); + // void * data = gguf_meta_get_meta_data(ctx); + // fseek(f, 0, SEEK_SET); + // fwrite(f, data, gguf_get_meta_size(ctx)); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + + // + // system info + // + + GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx2 (void); + GGML_API int ggml_cpu_has_avx512 (void); + GGML_API int ggml_cpu_has_avx512_vbmi(void); + GGML_API int ggml_cpu_has_avx512_vnni(void); + GGML_API int ggml_cpu_has_fma (void); + GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_metal (void); + GGML_API int ggml_cpu_has_f16c (void); + GGML_API int ggml_cpu_has_fp16_va (void); + GGML_API int ggml_cpu_has_wasm_simd (void); + GGML_API int ggml_cpu_has_blas (void); + GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_clblast (void); + GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_vsx (void); + + // + // Internal types and functions exposed for tests and benchmarks + // + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); + + typedef struct dpct_type_994041 { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; + + GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + +#ifdef __cplusplus +} +#endif diff --git a/dpcpp_out2/ggml.h.yaml b/dpcpp_out2/ggml.h.yaml new file mode 100644 index 0000000000000..47d52a213f243 --- /dev/null +++ b/dpcpp_out2/ggml.h.yaml @@ -0,0 +1,100 @@ +--- +MainSourceFile: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/dpcpp_out2/ggml.h' +Replacements: + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h' + Offset: 7458 + Length: 0 + ReplacementText: "#define DPCT_PROFILING_ENABLED\n#include \n#include \n" + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h' + Offset: 10556 + Length: 10 + ReplacementText: SYCL_LANGUAGE_VERSION + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false + - FilePath: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h' + Offset: 82284 + Length: 0 + ReplacementText: ' dpct_type_994041' + ConstantFlag: '' + ConstantOffset: 0 + InitStr: '' + NewHostVarName: '' + BlockLevelFormatFlag: false +MainSourceFilesDigest: + - MainSourceFile: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h' + Digest: a7f88ed7f3bbff01c9713ad58f5dac5b +DpctVersion: 18.0.0 +MainHelperFileName: '' +USMLevel: '' +FeatureMap: {} +CompileTargets: {} +OptionMap: + AnalysisScopePath: + Value: '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub' + Specified: false + AsyncHandler: + Value: 'false' + Specified: false + CommentsEnabled: + Value: 'false' + Specified: false + CompilationsDir: + Value: '' + Specified: false + CtadEnabled: + Value: 'false' + Specified: false + EnablepProfiling: + Value: 'true' + Specified: true + ExperimentalFlag: + Value: '0' + Specified: false + ExplicitClNamespace: + Value: 'false' + Specified: false + ExplicitNamespace: + Value: '20' + Specified: false + ExtensionDDFlag: + Value: '0' + Specified: false + ExtensionDEFlag: + Value: '4294967295' + Specified: false + HelperFuncPreferenceFlag: + Value: '0' + Specified: false + NDRangeDim: + Value: '3' + Specified: false + NoDRYPattern: + Value: 'false' + Specified: false + NoUseGenericSpace: + Value: '' + Specified: true + OptimizeMigration: + Value: 'false' + Specified: false + ProcessAll: + Value: 'false' + Specified: false + RuleFile: + Value: '' + Specified: false + SyclNamedLambda: + Value: 'false' + Specified: false + UsmLevel: + Value: '1' + Specified: false +... diff --git a/dpct/atomic.hpp b/dpct/atomic.hpp new file mode 100644 index 0000000000000..4b516f5304023 --- /dev/null +++ b/dpct/atomic.hpp @@ -0,0 +1,842 @@ +//==---- atomic.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_ATOMIC_HPP__ +#define __DPCT_ATOMIC_HPP__ + +#include + +namespace dpct { + +/// Atomically add the value operand to the value at the addr and assign the +/// result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to add to the value at \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_add(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_add(operand); +} + +template +inline T1 atomic_fetch_add(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_add(operand); +} + +/// Atomically add the value operand to the value at the addr and assign the +/// result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to add to the value at \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_add(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_add(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_add(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_add(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_add(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_add(addr, operand, memoryOrder); +} + +/// Atomically subtract the value operand from the value at the addr and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to subtract from the value at \p addr +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_sub(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_sub(operand); +} + +template +inline T1 atomic_fetch_sub(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_sub(operand); +} + +/// Atomically subtract the value operand from the value at the addr and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to subtract from the value at \p addr +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_sub(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_sub(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_sub(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_sub(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_sub(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_sub(addr, operand, memoryOrder); +} + +/// Atomically perform a bitwise AND between the value operand and the value at the addr +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise AND operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_and(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_and(operand); +} + +template +inline T1 atomic_fetch_and(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_and(operand); +} + +/// Atomically perform a bitwise AND between the value operand and the value at the addr +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise AND operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_and(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_and(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_and(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_and(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_and(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_and(addr, operand, memoryOrder); +} + +/// Atomically or the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise OR operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_or(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_or(operand); +} + +template +inline T1 atomic_fetch_or(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_or(operand); +} + +/// Atomically or the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise OR operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_or(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_or(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_or(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_or(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_or(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_or(addr, operand, memoryOrder); +} + +/// Atomically xor the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise XOR operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_xor(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_xor(operand); +} + +template +inline T1 atomic_fetch_xor(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_xor(operand); +} + +/// Atomically xor the value at the addr with the value operand, and assign +/// the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to use in bitwise XOR operation with the value at the \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_xor(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_xor(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_xor(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_xor(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_xor(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_xor(addr, operand, memoryOrder); +} + +/// Atomically calculate the minimum of the value at addr and the value operand +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_min(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_min(operand); +} + +template +inline T1 atomic_fetch_min(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_min(operand); +} + +/// Atomically calculate the minimum of the value at addr and the value operand +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_min(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_min(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_min(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_min(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_min(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_min(addr, operand, memoryOrder); +} + +/// Atomically calculate the maximum of the value at addr and the value operand +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_max(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_max(operand); +} + +template +inline T1 atomic_fetch_max(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_max(operand); +} + +/// Atomically calculate the maximum of the value at addr and the value operand +/// and assign the result to the value at addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_fetch_max(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_max(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_max(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_max(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_fetch_max(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_max(addr, operand, memoryOrder); +} + +/// Atomically set \p operand to the value stored in \p addr, if old value stored in +/// \p addr is equal to zero or greater than \p operand, else decrease the value stored +/// in \p addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The threshold value. +/// \param memoryOrder The memory ordering used. +/// \returns The old value stored in \p addr. +template +inline unsigned int atomic_fetch_compare_dec(unsigned int *addr, + unsigned int operand) { + auto atm = sycl::atomic_ref(addr[0]); + unsigned int old; + + while (true) { + old = atm.load(); + if (old == 0 || old > operand) { + if (atm.compare_exchange_strong(old, operand)) + break; + } else if (atm.compare_exchange_strong(old, old - 1)) + break; + } + + return old; +} + +/// Atomically increment the value stored in \p addr if old value stored in \p +/// addr is less than \p operand, else set 0 to the value stored in \p addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The threshold value. +/// \param memoryOrder The memory ordering used. +/// \returns The old value stored in \p addr. +template +inline unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand) { + auto atm = sycl::atomic_ref(addr[0]); + unsigned int old; + while (true) { + old = atm.load(); + if (old >= operand) { + if (atm.compare_exchange_strong(old, 0)) + break; + } else if (atm.compare_exchange_strong(old, old + 1)) + break; + } + return old; +} + +/// Atomically increment the value stored in \p addr if old value stored in \p +/// addr is less than \p operand, else set 0 to the value stored in \p addr. +/// \param [in, out] addr The pointer to the data. +/// \param operand The threshold value. +/// \param memoryOrder The memory ordering used. +/// \returns The old value stored in \p addr. +template +inline unsigned int +atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_compare_inc(addr, + operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_compare_inc(addr, + operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_compare_inc(addr, + operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +/// Atomically exchange the value at the address addr with the value operand. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to be exchanged with the value pointed by \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_exchange(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.exchange(operand); +} + +template +inline T1 atomic_exchange(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.exchange(operand); +} + +/// Atomically exchange the value at the address addr with the value operand. +/// \param [in, out] addr The pointer to the data. +/// \param operand The value to be exchanged with the value pointed by \p addr. +/// \param memoryOrder The memory ordering used. +/// \returns The value at the \p addr before the call. +template +inline T atomic_exchange(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_exchange(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_exchange(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_exchange(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } +} + +template +inline T1 atomic_exchange(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_exchange(addr, operand, memoryOrder); +} + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value expected. +/// Returns the value at the \p addr before the call. +/// \param [in, out] addr Multi_ptr. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr is expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. +template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + auto atm = sycl::atomic_ref(*addr); + + atm.compare_exchange_strong(expected, desired, success, fail); + return expected; +} + +template +T1 atomic_compare_exchange_strong( + sycl::multi_ptr addr, T2 expected, T3 desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + auto atm = + sycl::atomic_ref(*addr); + T1 expected_value = expected; + atm.compare_exchange_strong(expected_value, desired, success, fail); + return expected_value; +} + +/// Atomically compare the value at \p addr to the value expected and exchange +/// with the value desired if the value at \p addr is equal to the value expected. +/// Returns the value at the \p addr before the call. +/// \param [in] addr The pointer to the data. +/// \param expected The value to compare against the value at \p addr. +/// \param desired The value to assign to \p addr if the value at \p addr is expected. +/// \param success The memory ordering used when comparison succeeds. +/// \param fail The memory ordering used when comparison fails. +/// \returns The value at the \p addr before the call. +template +T atomic_compare_exchange_strong( + T *addr, T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + auto atm = + sycl::atomic_ref(addr[0]); + atm.compare_exchange_strong(expected, desired, success, fail); + return expected; +} + +template +T1 atomic_compare_exchange_strong( + T1 *addr, T2 expected, T3 desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed) { + T1 expected_value = expected; + auto atm = + sycl::atomic_ref(addr[0]); + atm.compare_exchange_strong(expected_value, desired, success, fail); + return expected_value; +} + +/// Atomic extension to implement standard APIs in std::atomic +namespace detail{ +template struct IsValidAtomicType { + static constexpr bool value = + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_pointer::value); +}; +} // namespace detail + +template +class atomic{ + static_assert( + detail::IsValidAtomicType::value, + "Invalid atomic type. Valid types are int, unsigned int, long, " + "unsigned long, long long, unsigned long long, float, double " + "and pointer types"); + T __d; + +public: + /// default memory synchronization order + static constexpr sycl::memory_order default_read_order = + sycl::atomic_ref::default_read_order; + static constexpr sycl::memory_order default_write_order = + sycl::atomic_ref::default_write_order; + static constexpr sycl::memory_scope default_scope = DefaultScope; + static constexpr sycl::memory_order default_read_modify_write_order = + DefaultOrder; + + + /// Default constructor. + constexpr atomic() noexcept = default; + /// Constructor with initialize value. + constexpr atomic(T d) noexcept : __d(d){}; + + /// atomically replaces the value of the referenced object with a non-atomic argument + /// \param operand The value to replace the pointed value. + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + void store(T operand, sycl::memory_order memoryOrder = default_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + atm.store(operand, memoryOrder, memoryScope); + } + + /// atomically obtains the value of the referenced object + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object + T load(sycl::memory_order memoryOrder = default_read_order, + sycl::memory_scope memoryScope = default_scope) const noexcept { + sycl::atomic_ref atm( + const_cast(__d)); + return atm.load(memoryOrder, memoryScope); + } + + /// atomically replaces the value of the referenced object and obtains the value held previously + /// \param operand The value to replace the pointed value. + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T exchange(T operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.exchange(operand, memoryOrder, memoryScope); + } + + /// atomically compares the value of the referenced object with non-atomic argument + /// and performs atomic exchange if equal or atomic load if not + /// \param expected The value expected to be found in the object referenced by the atomic_ref object + /// \param desired The value to store in the referenced object if it is as expected + /// \param success The memory models for the read-modify-write + /// \param failure The memory models for load operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false otherwise. + bool compare_exchange_weak( + T &expected, T desired, + sycl::memory_order success, sycl::memory_order failure, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope); + } + /// \param expected The value expected to be found in the object referenced by the atomic_ref object + /// \param desired The value to store in the referenced object if it is as expected + /// \param memoryOrder The memory synchronization ordering for operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false otherwise. + bool compare_exchange_weak(T &expected, T desired, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope); + } + + /// atomically compares the value of the referenced object with non-atomic argument + /// and performs atomic exchange if equal or atomic load if not + /// \param expected The value expected to be found in the object referenced by the atomic_ref object + /// \param desired The value to store in the referenced object if it is as expected + /// \param success The memory models for the read-modify-write + /// \param failure The memory models for load operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false otherwise. + bool compare_exchange_strong( + T &expected, T desired, + sycl::memory_order success, sycl::memory_order failure, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope); + } + /// \param expected The value expected to be found in the object referenced by the atomic_ref object + /// \param desired The value to store in the referenced object if it is as expected + /// \param memoryOrder The memory synchronization ordering for operations + /// \param memoryScope The memory scope used. + /// \returns true if the referenced object was successfully changed, false otherwise. + bool compare_exchange_strong(T &expected, T desired, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + sycl::atomic_ref atm(__d); + return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope); + } + + /// atomically adds the argument to the value stored in the atomic object and obtains the value held previously + /// \param operand The other argument of arithmetic addition + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T fetch_add(T operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.fetch_add(operand, memoryOrder, memoryScope); + } + + /// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously + /// \param operand The other argument of arithmetic subtraction + /// \param memoryOrder The memory ordering used. + /// \param memoryScope The memory scope used. + /// \returns The value of the referenced object before the call. + T fetch_sub(T operand, + sycl::memory_order memoryOrder = default_read_modify_write_order, + sycl::memory_scope memoryScope = default_scope) noexcept { + + sycl::atomic_ref atm(__d); + return atm.fetch_sub(operand, memoryOrder, memoryScope); + } +}; + +} // namespace dpct +#endif // __DPCT_ATOMIC_HPP__ diff --git a/dpct/blas_utils.hpp b/dpct/blas_utils.hpp new file mode 100644 index 0000000000000..df222c528bc08 --- /dev/null +++ b/dpct/blas_utils.hpp @@ -0,0 +1,1792 @@ +//==---- blas_utils.hpp----------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_BLAS_UTILS_HPP__ +#define __DPCT_BLAS_UTILS_HPP__ + +#include "memory.hpp" +#include "util.hpp" +#include "lib_common_utils.hpp" +#include +#include +#include +#include +#include + +namespace dpct { + +/// Get the value of \p s. +/// Copy the data to host synchronously, then return the data. +/// \param [in] p The pointer points the data. +/// \param [in] q The queue where the memory copy should be executed. +template +inline auto get_value(const T *s, sycl::queue &q) { + return detail::get_value(s, q); +} + +namespace detail { +inline void mem_free(sycl::queue *exec_queue, + std::vector pointers_array, sycl::event e) { + e.wait(); + for (auto p : pointers_array) + sycl::free(p, *exec_queue); +} + +inline int stride_for(int num_elems, int mem_align_in_elems) { + return ((num_elems - 1) / mem_align_in_elems + 1) * mem_align_in_elems; +} + +#ifndef DPCT_USM_LEVEL_NONE +template +class working_memory { + T *_input_ptr; + T *_temp_ptr; + bool _is_sycl_malloced = false; + bool _is_scalar_value = false; + sycl::queue _q; + sycl::event _e; + +public: + working_memory(size_t size, sycl::queue q) : _q(q) { + _is_scalar_value = false; + _temp_ptr = (T *)sycl::malloc_device(size, q); + } + working_memory(T *result_ptr, sycl::queue q) : _input_ptr(result_ptr), _q(q) { + _is_scalar_value = true; + _is_sycl_malloced = sycl::get_pointer_type(_input_ptr, _q.get_context()) != + sycl::usm::alloc::unknown; + if (!_is_sycl_malloced) + _temp_ptr = sycl::malloc_shared(1, _q); + } + auto get_ptr() { + if (_is_scalar_value && _is_sycl_malloced) + return _input_ptr; + return _temp_ptr; + } + void set_event(sycl::event e) { _e = e; } + ~working_memory() { + if (_is_scalar_value) { + if (!_is_sycl_malloced) { + _q.memcpy(_input_ptr, _temp_ptr, sizeof(T)).wait(); + sycl::free(_temp_ptr, _q); + } + } else { + std::vector ptrs{_temp_ptr}; + dpct::async_dpct_free(ptrs, {_e}); + } + } +}; +#endif + +template +inline void nrm2_impl(sycl::queue &q, int n, const void *x, int incx, + void *result) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else +#ifdef DPCT_USM_LEVEL_NONE + auto x_buffer = dpct::get_buffer(x); + auto r_buffer = + sycl::buffer(reinterpret_cast(result), sycl::range<1>(1)); + if (dpct::is_device_ptr(result)) + r_buffer = dpct::get_buffer(result); + oneapi::mkl::blas::column_major::nrm2(q, n, x_buffer, incx, r_buffer); +#else + working_memory res_mem(reinterpret_cast(result), q); + oneapi::mkl::blas::column_major::nrm2(q, n, reinterpret_cast(x), + incx, res_mem.get_ptr()); +#endif +#endif +} + +template +inline void dotuc_impl(sycl::queue &q, int n, const Txy *x, int incx, + const Txy *y, int incy, Tr *result) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else +#ifdef DPCT_USM_LEVEL_NONE + auto x_buffer = dpct::get_buffer(x); + auto y_buffer = dpct::get_buffer(y); + auto r_buffer = sycl::buffer((Tr *)result, sycl::range<1>(1)); + if (dpct::is_device_ptr(result)) + r_buffer = dpct::get_buffer(result); + if constexpr (std::is_same_v> || + std::is_same_v>) { + if constexpr (is_conjugate) + oneapi::mkl::blas::column_major::dotc(q, n, x_buffer, incx, y_buffer, + incy, r_buffer); + else + oneapi::mkl::blas::column_major::dotu(q, n, x_buffer, incx, y_buffer, + incy, r_buffer); + } else + oneapi::mkl::blas::column_major::dot(q, n, x_buffer, incx, y_buffer, incy, + r_buffer); +#else + working_memory res_mem(result, q); + if constexpr (std::is_same_v> || + std::is_same_v>) { + if constexpr (is_conjugate) + oneapi::mkl::blas::column_major::dotc(q, n, x, incx, y, incy, res_mem.get_ptr()); + else + oneapi::mkl::blas::column_major::dotu(q, n, x, incx, y, incy, res_mem.get_ptr()); + } else + oneapi::mkl::blas::column_major::dot(q, n, x, incx, y, incy, res_mem.get_ptr()); +#endif +#endif +} + +template +inline void dotuc(sycl::queue &q, int n, const void *x, + library_data_t x_type, int incx, const void *y, + library_data_t y_type, int incy, void *result, + library_data_t result_type) { + std::uint64_t key = detail::get_type_combination_id(x_type, y_type, result_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float): { + detail::dotuc_impl( + q, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, + reinterpret_cast(result)); + break; + } + case detail::get_type_combination_id(library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double): { + detail::dotuc_impl( + q, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, + reinterpret_cast(result)); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::complex_float, + library_data_t::complex_float): { + detail::dotuc_impl( + q, n, reinterpret_cast *>(x), incx, + reinterpret_cast *>(y), incy, + reinterpret_cast *>(result)); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::complex_double, + library_data_t::complex_double): { + detail::dotuc_impl( + q, n, reinterpret_cast *>(x), incx, + reinterpret_cast *>(y), incy, + reinterpret_cast *>(result)); + break; + } + case detail::get_type_combination_id(library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half): { + detail::dotuc_impl( + q, n, reinterpret_cast(x), incx, + reinterpret_cast(y), incy, + reinterpret_cast(result)); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +template +inline void scal_impl(sycl::queue &q, int n, const void *alpha, void *x, + int incx) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else + Te alpha_val = dpct::get_value(reinterpret_cast(alpha), q); + auto data_x = get_memory(x); + oneapi::mkl::blas::column_major::scal(q, n, alpha_val, + data_x, incx); +#endif +} + +template +inline void axpy_impl(sycl::queue &q, int n, const void *alpha, const void *x, + int incx, void *y, int incy) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else + Te alpha_val = dpct::get_value(reinterpret_cast(alpha), q); + auto data_x = get_memory(x); + auto data_y = get_memory(y); + oneapi::mkl::blas::column_major::axpy(q, n, alpha_val, + data_x, incx, + data_y, incy); +#endif +} + +template +inline void rot_impl(sycl::queue &q, int n, void *x, int incx, void *y, + int incy, const void *c, const void *s) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else + Tc c_value = dpct::get_value(reinterpret_cast(c), q); + Ts s_value = dpct::get_value(reinterpret_cast(s), q); + auto data_x = get_memory(x); + auto data_y = get_memory(y); + oneapi::mkl::blas::column_major::rot(q, n, data_x, incx, + data_y, incy, c_value, + s_value); +#endif +} + +template +inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, int lda, const void *b, + int ldb, const void *beta, void *c, int ldc) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::mkl::blas::column_major::gemm( + q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + data_b, ldb, beta_value, data_c, ldc); +#endif +} + +template +inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void **a, int lda, + const void **b, int ldb, const void *beta, void **c, + int ldc, int batch_size) { + struct matrix_info_t { + oneapi::mkl::transpose transpose_info[2]; + Ts value_info[2]; + std::int64_t size_info[3]; + std::int64_t ld_info[3]; + std::int64_t groupsize_info; + }; + + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + + matrix_info_t *matrix_info = + (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); + matrix_info->transpose_info[0] = a_trans; + matrix_info->transpose_info[1] = b_trans; + matrix_info->value_info[0] = alpha_value; + matrix_info->value_info[1] = beta_value; + matrix_info->size_info[0] = m; + matrix_info->size_info[1] = n; + matrix_info->size_info[2] = k; + matrix_info->ld_info[0] = lda; + matrix_info->ld_info[1] = ldb; + matrix_info->ld_info[2] = ldc; + matrix_info->groupsize_info = batch_size; + + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, + matrix_info->size_info, matrix_info->size_info + 1, + matrix_info->size_info + 2, matrix_info->value_info, + reinterpret_cast(a), matrix_info->ld_info, + reinterpret_cast(b), matrix_info->ld_info + 1, + matrix_info->value_info + 1, reinterpret_cast(c), + matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.host_task([=] { std::free(matrix_info); }); + }); +} + +template +inline void +gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, + int k, const void *alpha, const void *a, int lda, + long long int stride_a, const void *b, int ldb, + long long int stride_b, const void *beta, void *c, + int ldc, long long int stride_c, int batch_size) { + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::mkl::blas::column_major::gemm_batch( + q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + stride_a, data_b, ldb, stride_b, beta_value, + data_c, ldc, stride_c, batch_size); +} + +template +inline void rk_impl(sycl::queue &q, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, int n, int k, + const T *alpha, const T *a, int lda, const T *b, + int ldb, const Tbeta *beta, T *c, int ldc) { + // For symmetric matrix, this function performs: C = alpha*OP(A)*(OP(B))^T + beta*C + // For Hermitian matrix, this function performs: C = alpha*OP(A)*(OP(B))^H + beta*C + // The gemmt() function performs: C = alpha*OPA(A)*OPB(B) + beta*C + // So the OPB need be updated before we call gemmt(). + using Ty = typename dpct::DataType::T2; + using Ts = typename dpct::DataType::T2; + Ty alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + oneapi::mkl::transpose trans_A = trans, trans_B = trans; + int origin_b_rows = trans == oneapi::mkl::transpose::nontrans ? n : k; + int origin_b_cols = trans == oneapi::mkl::transpose::nontrans ? k : n; + + if ((is_hermitian && trans == oneapi::mkl::transpose::trans) || + (!is_hermitian && !std::is_floating_point_v && trans == oneapi::mkl::transpose::conjtrans)) { + // In this case, OPB need be a conjugate operation, + // but only notrans, conjtrans and trans are available. + // So we need do a conjtrans operation first, then do a trans operation. + trans_B = oneapi::mkl::transpose::trans; + auto data_a = get_memory(a); + auto data_c = get_memory(c); +#ifdef DPCT_USM_LEVEL_NONE + auto new_B_buffer = sycl::buffer(sycl::range<1>(origin_b_rows * origin_b_cols)); + auto from_buffer = dpct::get_buffer(b); + oneapi::mkl::blas::column_major::omatcopy_batch( + q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols, + Ts(1.0), from_buffer, ldb, origin_b_rows * ldb, new_B_buffer, + origin_b_cols, origin_b_rows * origin_b_cols, 1); + oneapi::mkl::blas::column_major::gemmt( + q, uplo, trans_A, trans_B, n, k, alpha_value, + data_a, lda, new_B_buffer, origin_b_cols, beta_value, data_c, ldc); +#else + working_memory new_B(origin_b_rows * origin_b_cols * sizeof(T), q); + oneapi::mkl::blas::column_major::omatcopy_batch( + q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols, + Ts(1.0), reinterpret_cast(b), ldb, origin_b_rows * ldb, + reinterpret_cast(new_B.get_ptr()), origin_b_cols, + origin_b_rows * origin_b_cols, 1); + sycl::event e = oneapi::mkl::blas::column_major::gemmt( + q, uplo, trans_A, trans_B, n, k, alpha_value, + data_a, lda, reinterpret_cast(new_B.get_ptr()), origin_b_cols, + beta_value, data_c, ldc); + new_B.set_event(e); +#endif + } else { + if constexpr (is_hermitian) { + trans_B = trans == oneapi::mkl::transpose::nontrans + ? oneapi::mkl::transpose::conjtrans + : oneapi::mkl::transpose::nontrans; + } else { + trans_B = trans == oneapi::mkl::transpose::nontrans + ? oneapi::mkl::transpose::trans + : oneapi::mkl::transpose::nontrans; + } + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::mkl::blas::column_major::gemmt( + q, uplo, trans_A, trans_B, n, k, alpha_value, + data_a, lda, data_b, ldb, beta_value, data_c, ldc); + } +} + +template +inline void +trsm_batch_impl(sycl::queue &q, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, int m, int n, const void *alpha, + const void **a, int lda, void **b, int ldb, int batch_size) { + struct matrix_info_t { + matrix_info_t(oneapi::mkl::side side_info, oneapi::mkl::uplo uplo_info, + oneapi::mkl::transpose transpose_info, + oneapi::mkl::diag diag_info, Ts value_info, std::int64_t m, + std::int64_t n, std::int64_t lda, std::int64_t ldb, + std::int64_t groupsize_info) + : side_info(side_info), uplo_info(uplo_info), + transpose_info(transpose_info), diag_info(diag_info), + value_info(value_info), groupsize_info(groupsize_info) { + size_info[0] = m; + size_info[1] = n; + ld_info[0] = lda; + ld_info[1] = ldb; + } + oneapi::mkl::side side_info; + oneapi::mkl::uplo uplo_info; + oneapi::mkl::transpose transpose_info; + oneapi::mkl::diag diag_info; + Ts value_info; + std::int64_t size_info[2]; + std::int64_t ld_info[2]; + std::int64_t groupsize_info; + }; + + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + + matrix_info_t *matrix_info = + new matrix_info_t(left_right, upper_lower, trans, unit_diag, alpha_value, + m, n, lda, ldb, batch_size); + + sycl::event e = oneapi::mkl::blas::column_major::trsm_batch( + q, &(matrix_info->side_info), &(matrix_info->uplo_info), + &(matrix_info->transpose_info), &(matrix_info->diag_info), + matrix_info->size_info, matrix_info->size_info + 1, + &(matrix_info->value_info), reinterpret_cast(a), + matrix_info->ld_info, reinterpret_cast(b), + matrix_info->ld_info + 1, 1, &(matrix_info->groupsize_info)); + + q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.host_task([=] { delete matrix_info; }); + }); +} + +template +inline void getrfnp_batch_wrapper(sycl::queue &exec_queue, int n, T *a[], + int lda, int *info, int batch_size) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " + "Project does not support this API."); +#else + using Ty = typename DataType::T2; + // Set the info array value to 0 + detail::dpct_memset(exec_queue, info, 0, sizeof(int) * batch_size); + std::int64_t stride_a = n * lda; + std::int64_t scratchpad_size = + oneapi::mkl::lapack::getrfnp_batch_scratchpad_size( + exec_queue, n, n, lda, stride_a, batch_size); + + Ty *a_strided_mem = + (Ty *)dpct::dpct_malloc(stride_a * batch_size * sizeof(Ty), exec_queue); + T **host_a = (T **)std::malloc(batch_size * sizeof(T *)); + dpct::dpct_memcpy(host_a, a, batch_size * sizeof(T *)); + for (std::int64_t i = 0; i < batch_size; ++i) + dpct::dpct_memcpy(a_strided_mem + i * stride_a, host_a[i], + n * lda * sizeof(T)); + +#ifdef DPCT_USM_LEVEL_NONE + { + sycl::buffer scratchpad{sycl::range<1>(scratchpad_size)}; + auto a_buffer = get_buffer(a_strided_mem); + oneapi::mkl::lapack::getrfnp_batch(exec_queue, n, n, a_buffer, lda, + stride_a, batch_size, scratchpad, + scratchpad_size); + } + std::vector events; + for (std::int64_t i = 0; i < batch_size; ++i) + events.push_back(detail::dpct_memcpy(exec_queue, host_a[i], + a_strided_mem + i * stride_a, + n * lda * sizeof(T), automatic)); +#else + Ty *scratchpad = sycl::malloc_device(scratchpad_size, exec_queue); + sycl::event e = oneapi::mkl::lapack::getrfnp_batch( + exec_queue, n, n, a_strided_mem, lda, stride_a, batch_size, scratchpad, + scratchpad_size); + std::vector events; + for (std::int64_t i = 0; i < batch_size; ++i) + events.push_back(detail::dpct_memcpy(exec_queue, host_a[i], + a_strided_mem + i * stride_a, + n * lda * sizeof(T), automatic, {e})); + + std::vector ptrs{scratchpad, a_strided_mem}; + dpct::async_dpct_free(ptrs, events, exec_queue); +#endif + + exec_queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(events); + cgh.host_task([=] { std::free(host_a); }); + }); +#endif +} + +} // namespace detail + +inline oneapi::mkl::transpose get_transpose(int t) { + if (t == 0) { + return oneapi::mkl::transpose::nontrans; + } else if (t == 1) { + return oneapi::mkl::transpose::trans; + } else { + return oneapi::mkl::transpose::conjtrans; + } +} + +/// Computes the LU factorizations of a batch of general matrices. +/// \param [in] exec_queue The queue where the routine should be executed. +/// \param [in] n The order of the matrices. +/// \param [in, out] a Array of pointers to matrices. These matrices will be +/// overwritten by lower triangulars with unit diagonal elements and upper +/// triangulars. +/// \param [in] lda The leading dimension of the matrices. +/// \param [out] ipiv An array stores the pivot indices. If \p ipiv is nullptr, +/// non-pivoting LU factorization is computed. +/// \param [out] info An array stores the error information. +/// \param [in] batch_size The size of the batch. +template +inline void getrf_batch_wrapper(sycl::queue &exec_queue, int n, T *a[], + int lda, int *ipiv, int *info, int batch_size) { + if (ipiv == nullptr) { + detail::getrfnp_batch_wrapper(exec_queue, n, a, lda, info, batch_size); + return; + } + using Ty = typename DataType::T2; + // Set the info array value to 0 + detail::dpct_memset(exec_queue, info, 0, sizeof(int) * batch_size); +#ifdef DPCT_USM_LEVEL_NONE + std::int64_t stride_a = n * lda; + std::int64_t stride_ipiv = n; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size( + exec_queue, n, n, lda, stride_a, stride_ipiv, batch_size); + + T *a_buffer_ptr; + a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T)); + + T **host_a = (T **)std::malloc(batch_size * sizeof(T *)); + dpct_memcpy(host_a, a, batch_size * sizeof(T *)); + for (std::int64_t i = 0; i < batch_size; ++i) + dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T)); + + { + sycl::buffer ipiv_buf( + sycl::range<1>(batch_size * stride_ipiv)); + sycl::buffer scratchpad{sycl::range<1>(scratchpad_size)}; + auto a_buffer = get_buffer(a_buffer_ptr); + oneapi::mkl::lapack::getrf_batch(exec_queue, n, n, a_buffer, lda, stride_a, + ipiv_buf, stride_ipiv, batch_size, scratchpad, + scratchpad_size); + + auto to_buffer = get_buffer(ipiv); + exec_queue.submit([&](sycl::handler &cgh) { + auto from_acc = ipiv_buf.get_access(cgh); + auto to_acc = to_buffer.get_access(cgh); + cgh.parallel_for>( + sycl::range<2>(batch_size, n), [=](sycl::id<2> id) { + to_acc[id.get(0) * n + id.get(1)] = + static_cast(from_acc[id.get(0) * stride_ipiv + id.get(1)]); + }); + }); + } + + // Copy back to the original buffers + std::vector events; + for (std::int64_t i = 0; i < batch_size; ++i) + events.push_back(detail::dpct_memcpy(exec_queue, host_a[i], + a_buffer_ptr + i * stride_a, + n * lda * sizeof(T), automatic)); + + std::vector ptrs{host_a}; + std::thread mem_free_thread( + [=](std::vector pointers_array, + std::vector events_array) { + sycl::event::wait(events_array); + for (auto p : pointers_array) + std::free(p); + }, + ptrs, events); + mem_free_thread.detach(); +#else + std::int64_t m_int64 = n; + std::int64_t n_int64 = n; + std::int64_t lda_int64 = lda; + std::int64_t group_sizes = batch_size; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size( + exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes); + + Ty *scratchpad = sycl::malloc_device(scratchpad_size, exec_queue); + std::int64_t *ipiv_int64 = + sycl::malloc_device(batch_size * n, exec_queue); + std::int64_t **ipiv_int64_ptr = + sycl::malloc_shared(batch_size, exec_queue); + T **a_shared = sycl::malloc_shared(batch_size, exec_queue); + exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)).wait(); + for (std::int64_t i = 0; i < batch_size; ++i) + ipiv_int64_ptr[i] = ipiv_int64 + n * i; + + oneapi::mkl::lapack::getrf_batch(exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64, + ipiv_int64_ptr, 1, &group_sizes, scratchpad, + scratchpad_size); + + sycl::event e = exec_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for>( + sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) { + ipiv[idx] = static_cast(ipiv_int64[idx]); + }); + }); + + std::vector ptrs{scratchpad, ipiv_int64, ipiv_int64_ptr, a_shared}; + async_dpct_free(ptrs, {e}, exec_queue); +#endif +} + +/// Solves a system of linear equations with a batch of LU-factored square +/// coefficient matrices, with multiple right-hand sides. +/// \param [in] exec_queue The queue where the routine should be executed. +/// \param [in] trans Indicates the form of the linear equations. +/// \param [in] n The order of the matrices. +/// \param [in] nrhs The number of right hand sides. +/// \param [in] a Array of pointers to matrices. +/// \param [in] lda The leading dimension of the matrices in \p a. +/// \param [in] ipiv An array stores the pivots. +/// \param [in, out] b Array of pointers to matrices, whose columns are +/// the right-hand sides for the systems of equations. +/// \param [in] ldb The leading dimension of the matrices in \p b. +/// \param [out] info A value stores the error information. +/// \param [in] batch_size The size of the batch. +template +inline void getrs_batch_wrapper(sycl::queue &exec_queue, + oneapi::mkl::transpose trans, int n, int nrhs, + const T *a[], int lda, const int *ipiv, T *b[], + int ldb, int *info, int batch_size) { + using Ty = typename DataType::T2; + // Set the info value to 0 + *info = 0; +#ifdef DPCT_USM_LEVEL_NONE + std::int64_t stride_a = n * lda; + std::int64_t stride_b = nrhs * ldb; + std::int64_t stride_ipiv = n; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size( + exec_queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, + batch_size); + + T *a_buffer_ptr, *b_buffer_ptr; + a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T)); + b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T)); + + T **host_a = (T **)std::malloc(batch_size * sizeof(T *)); + T **host_b = (T **)std::malloc(batch_size * sizeof(T *)); + dpct_memcpy(host_a, a, batch_size * sizeof(T *)); + dpct_memcpy(host_b, b, batch_size * sizeof(T *)); + for (std::int64_t i = 0; i < batch_size; ++i) { + dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T)); + dpct_memcpy(b_buffer_ptr + i * stride_b, host_b[i], nrhs * ldb * sizeof(T)); + } + + { + auto a_buffer = get_buffer(a_buffer_ptr); + auto b_buffer = get_buffer(b_buffer_ptr); + sycl::buffer scratchpad{sycl::range<1>(scratchpad_size)}; + sycl::buffer ipiv_buf( + sycl::range<1>(batch_size * stride_ipiv)); + auto from_buf = get_buffer(ipiv); + exec_queue.submit([&](sycl::handler &cgh) { + auto from_acc = from_buf.get_access(cgh); + auto to_acc = ipiv_buf.get_access(cgh); + cgh.parallel_for>( + sycl::range<2>(batch_size, n), [=](sycl::id<2> id) { + to_acc[id.get(0) * stride_ipiv + id.get(1)] = + static_cast(from_acc[id.get(0) * n + id.get(1)]); + }); + }); + + oneapi::mkl::lapack::getrs_batch(exec_queue, trans, n, nrhs, a_buffer, lda, + stride_a, ipiv_buf, stride_ipiv, b_buffer, ldb, + stride_b, batch_size, scratchpad, scratchpad_size); + } + + // Copy back to the original buffers + std::vector events; + for (std::int64_t i = 0; i < batch_size; ++i) + events.push_back(detail::dpct_memcpy(exec_queue, host_b[i], + b_buffer_ptr + i * stride_b, + nrhs * ldb * sizeof(T), automatic)); + std::vector ptrs{host_a, host_b}; + std::thread mem_free_thread( + [=](std::vector pointers_array, + std::vector events_array) { + sycl::event::wait(events_array); + for (auto p : pointers_array) + std::free(p); + }, + ptrs, events); + mem_free_thread.detach(); +#else + std::int64_t n_int64 = n; + std::int64_t nrhs_int64 = nrhs; + std::int64_t lda_int64 = lda; + std::int64_t ldb_int64 = ldb; + std::int64_t group_sizes = batch_size; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size( + exec_queue, &trans, &n_int64, &nrhs_int64, &lda_int64, &ldb_int64, 1, + &group_sizes); + + Ty *scratchpad = sycl::malloc_device(scratchpad_size, exec_queue); + std::int64_t *ipiv_int64 = + sycl::malloc_device(batch_size * n, exec_queue); + std::int64_t **ipiv_int64_ptr = + sycl::malloc_shared(batch_size, exec_queue); + T **a_shared = sycl::malloc_shared(batch_size, exec_queue); + T **b_shared = sycl::malloc_shared(batch_size, exec_queue); + exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)); + exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *)); + + exec_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for>( + sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) { + ipiv_int64[idx] = static_cast(ipiv[idx]); + }); + }).wait(); + + for (std::int64_t i = 0; i < batch_size; ++i) + ipiv_int64_ptr[i] = ipiv_int64 + n * i; + + sycl::event e = oneapi::mkl::lapack::getrs_batch( + exec_queue, &trans, &n_int64, &nrhs_int64, (Ty **)a_shared, &lda_int64, + ipiv_int64_ptr, (Ty **)b_shared, &ldb_int64, 1, &group_sizes, scratchpad, + scratchpad_size); + + std::vector ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared}; + async_dpct_free(ptrs, {e}, exec_queue); +#endif +} + +/// Computes the inverses of a batch of LU-factored matrices. +/// \param [in] exec_queue The queue where the routine should be executed. +/// \param [in] n The order of the matrices. +/// \param [in] a Array of pointers to matrices. +/// \param [in] lda The leading dimension of the matrices in \p a. +/// \param [in] ipiv An array stores the pivots. +/// \param [out] b Array of pointers to inverse matrices. +/// \param [in] ldb The leading dimension of the matrices in \p b. +/// \param [out] info An array stores the error information. +/// \param [in] batch_size The size of the batch. +template +inline void getri_batch_wrapper(sycl::queue &exec_queue, int n, + const T *a[], int lda, int *ipiv, T *b[], + int ldb, int *info, int batch_size) { + using Ty = typename DataType::T2; + // Set the info array value to 0 + detail::dpct_memset(exec_queue, info, 0, sizeof(int) * batch_size); +#ifdef DPCT_USM_LEVEL_NONE + std::int64_t stride_b = n * ldb; + std::int64_t stride_ipiv = n; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size( + exec_queue, n, ldb, stride_b, stride_ipiv, batch_size); + + T *b_buffer_ptr; + b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T)); + + T **host_a = (T **)std::malloc(batch_size * sizeof(T *)); + T **host_b = (T **)std::malloc(batch_size * sizeof(T *)); + dpct_memcpy(host_a, a, batch_size * sizeof(T *)); + dpct_memcpy(host_b, b, batch_size * sizeof(T *)); + + for (std::int64_t i = 0; i < batch_size; ++i) { + // Need to create a copy of input matrices "a" to keep them unchanged. + // Matrices "b" (copy of matrices "a") will be used as input and output + // parameter in oneapi::mkl::lapack::getri_batch call. + matrix_mem_copy(b_buffer_ptr + i * stride_b, host_a[i], ldb, lda, n, n, + dpct::device_to_device, exec_queue); + } + + { + auto b_buffer = get_buffer(b_buffer_ptr); + sycl::buffer scratchpad{sycl::range<1>(scratchpad_size)}; + sycl::buffer ipiv_buf( + sycl::range<1>(batch_size * stride_ipiv)); + auto from_buf = get_buffer(ipiv); + exec_queue.submit([&](sycl::handler &cgh) { + auto from_acc = from_buf.get_access(cgh); + auto to_acc = ipiv_buf.get_access(cgh); + cgh.parallel_for>( + sycl::range<2>(batch_size, n), [=](sycl::id<2> id) { + to_acc[id.get(0) * stride_ipiv + id.get(1)] = + static_cast(from_acc[id.get(0) * n + id.get(1)]); + }); + }); + + oneapi::mkl::lapack::getri_batch(exec_queue, n, b_buffer, ldb, stride_b, ipiv_buf, + stride_ipiv, batch_size, scratchpad, + scratchpad_size); + } + + // Copy back to the original buffers + std::vector events; + for (std::int64_t i = 0; i < batch_size; ++i) + events.push_back(detail::dpct_memcpy(exec_queue, host_b[i], + b_buffer_ptr + i * stride_b, + n * ldb * sizeof(T), automatic)); + std::vector ptrs{host_a, host_b}; + std::thread mem_free_thread( + [=](std::vector pointers_array, + std::vector events_array) { + sycl::event::wait(events_array); + for (auto p : pointers_array) + std::free(p); + }, + ptrs, events); + mem_free_thread.detach(); +#else + std::int64_t n_int64 = n; + std::int64_t ldb_int64 = ldb; + std::int64_t group_sizes = batch_size; + std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size( + exec_queue, &n_int64, &ldb_int64, 1, &group_sizes); + + Ty *scratchpad = sycl::malloc_device(scratchpad_size, exec_queue); + std::int64_t *ipiv_int64 = + sycl::malloc_device(batch_size * n, exec_queue); + std::int64_t **ipiv_int64_ptr = + sycl::malloc_shared(batch_size, exec_queue); + + exec_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for>( + sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) { + ipiv_int64[idx] = static_cast(ipiv[idx]); + }); + }); + + T **a_shared = sycl::malloc_shared(batch_size, exec_queue); + T **b_shared = sycl::malloc_shared(batch_size, exec_queue); + exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)); + exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *)).wait(); + for (std::int64_t i = 0; i < batch_size; ++i) { + ipiv_int64_ptr[i] = ipiv_int64 + n * i; + // Need to create a copy of input matrices "a" to keep them unchanged. + // Matrices "b" (copy of matrices "a") will be used as input and output + // parameter in oneapi::mkl::lapack::getri_batch call. + matrix_mem_copy(b_shared[i], a_shared[i], ldb, lda, n, n, dpct::device_to_device, + exec_queue); + } + + sycl::event e = oneapi::mkl::lapack::getri_batch( + exec_queue, &n_int64, (Ty **)b_shared, &ldb_int64, ipiv_int64_ptr, 1, + &group_sizes, scratchpad, scratchpad_size); + + std::vector ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared}; + async_dpct_free(ptrs, {e}, exec_queue); +#endif +} + +/// Computes the QR factorizations of a batch of general matrices. +/// \param [in] exec_queue The queue where the routine should be executed. +/// \param [in] m The number of rows in the matrices. +/// \param [in] n The number of columns in the matrices. +/// \param [in, out] a Array of pointers to matrices. These +/// matrices will be overwritten by the factorization data. +/// \param [in] lda The leading dimension of the matrices in \p a. +/// \param [out] tau An array stores the scalars. +/// \param [out] info A value stores the error information. +/// \param [in] batch_size The size of the batch. +template +inline void geqrf_batch_wrapper(sycl::queue exec_queue, int m, int n, + T *a[], int lda, T *tau[], int *info, + int batch_size) { + using Ty = typename DataType::T2; + // Set the info value to 0 + *info = 0; +#ifdef DPCT_USM_LEVEL_NONE + std::int64_t stride_a = n * lda; + std::int64_t stride_tau = std::max(1, std::min(m, n)); + std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size( + exec_queue, m, n, lda, stride_a, stride_tau, batch_size); + + T *a_buffer_ptr, *tau_buffer_ptr; + a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T)); + tau_buffer_ptr = (T *)dpct_malloc(stride_tau * batch_size * sizeof(T)); + + T **host_a = (T **)std::malloc(batch_size * sizeof(T *)); + T **host_tau = (T **)std::malloc(batch_size * sizeof(T *)); + dpct_memcpy(host_a, a, batch_size * sizeof(T *)); + dpct_memcpy(host_tau, tau, batch_size * sizeof(T *)); + + for (std::int64_t i = 0; i < batch_size; ++i) + dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T)); + { + auto a_buffer = get_buffer(a_buffer_ptr); + auto tau_buffer = get_buffer(tau_buffer_ptr); + sycl::buffer scratchpad{sycl::range<1>(scratchpad_size)}; + oneapi::mkl::lapack::geqrf_batch(exec_queue, m, n, a_buffer, lda, stride_a, + tau_buffer, stride_tau, batch_size, scratchpad, + scratchpad_size); + } + + // Copy back to the original buffers + std::vector events_a; + std::vector events_tau; + for (std::int64_t i = 0; i < batch_size; ++i) { + events_a.push_back(detail::dpct_memcpy(exec_queue, host_a[i], + a_buffer_ptr + i * stride_a, + n * lda * sizeof(T), automatic)); + events_tau.push_back(detail::dpct_memcpy( + exec_queue, host_tau[i], tau_buffer_ptr + i * stride_tau, + std::max(1, std::min(m, n)) * sizeof(T), automatic)); + } + std::vector ptr_a{host_a}; + std::vector ptr_tau{host_tau}; + std::thread mem_free_thread_a( + [=](std::vector pointers_array, + std::vector events_array) { + sycl::event::wait(events_array); + for (auto p : pointers_array) + std::free(p); + }, + ptr_a, events_a); + std::thread mem_free_thread_tau( + [=](std::vector pointers_array, + std::vector events_array) { + sycl::event::wait(events_array); + for (auto p : pointers_array) + std::free(p); + }, + ptr_tau, events_tau); + mem_free_thread_a.detach(); + mem_free_thread_tau.detach(); +#else + std::int64_t m_int64 = n; + std::int64_t n_int64 = n; + std::int64_t lda_int64 = lda; + std::int64_t group_sizes = batch_size; + std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size( + exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes); + + Ty *scratchpad = sycl::malloc_device(scratchpad_size, exec_queue); + T **a_shared = sycl::malloc_shared(batch_size, exec_queue); + T **tau_shared = sycl::malloc_shared(batch_size, exec_queue); + exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)); + exec_queue.memcpy(tau_shared, tau, batch_size * sizeof(T *)).wait(); + + sycl::event e = oneapi::mkl::lapack::geqrf_batch( + exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64, (Ty **)tau_shared, 1, + &group_sizes, scratchpad, scratchpad_size); + + std::vector ptrs{scratchpad, a_shared, tau_shared}; + async_dpct_free(ptrs, {e}, exec_queue); +#endif +} + +/// Computes the Euclidean norm of a vector. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in] x Input vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +/// \param [out] result The result scalar. +/// \param [in] result_type Data type of the result. +inline void nrm2(sycl::queue &q, int n, const void *x, library_data_t x_type, + int incx, void *result, library_data_t result_type) { + std::uint64_t key = detail::get_type_combination_id(x_type, result_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float, + library_data_t::real_float): { + detail::nrm2_impl(q, n, x, incx, result); + break; + } + case detail::get_type_combination_id(library_data_t::real_double, + library_data_t::real_double): { + detail::nrm2_impl(q, n, x, incx, result); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::real_float): { + detail::nrm2_impl, float>( + q, n, x, incx, result); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::real_double): { + detail::nrm2_impl, double>( + q, n, x, incx, result); + break; + } + case detail::get_type_combination_id(library_data_t::real_half, + library_data_t::real_half): { + detail::nrm2_impl( + q, n, x, incx, result); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// Computes the dot product of two vectors. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in] x Input vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +/// \param [in] y Input vector y. +/// \param [in] y_type Data type of the vector y. +/// \param [in] incy Stride of vector y. +/// \param [out] result The result scalar. +/// \param [in] result_type Data type of the result. +inline void dot(sycl::queue &q, int n, const void *x, library_data_t x_type, + int incx, const void *y, library_data_t y_type, int incy, + void *result, library_data_t result_type) { + detail::dotuc(q, n, x, x_type, incx, y, y_type, incy, result, + result_type); +} + +/// Computes the dot product of two vectors, conjugating the first vector. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in] x Input vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +/// \param [in] y Input vector y. +/// \param [in] y_type Data type of the vector y. +/// \param [in] incy Stride of vector y. +/// \param [out] result The result scalar. +/// \param [in] result_type Data type of the result. +inline void dotc(sycl::queue &q, int n, const void *x, library_data_t x_type, + int incx, const void *y, library_data_t y_type, int incy, + void *result, library_data_t result_type) { + detail::dotuc(q, n, x, x_type, incx, y, y_type, incy, result, + result_type); +} + +/// Computes the product of a vector by a scalar. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in] alpha The scale factor alpha. +/// \param [in] alpha_type The data type of alpha. +/// \param [in, out] x Input/Output vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +inline void scal(sycl::queue &q, int n, const void *alpha, + library_data_t alpha_type, void *x, library_data_t x_type, + int incx) { + std::uint64_t key = detail::get_type_combination_id(x_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float): { + detail::scal_impl(q, n, alpha, x, incx); + break; + } + case detail::get_type_combination_id(library_data_t::real_double): { + detail::scal_impl(q, n, alpha, x, incx); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float): { + detail::scal_impl, std::complex>(q, n, alpha, + x, incx); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double): { + detail::scal_impl, std::complex>( + q, n, alpha, x, incx); + break; + } + case detail::get_type_combination_id(library_data_t::real_half): { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + sycl::half alaph_half(alpha_value); + detail::scal_impl(q, n, &alaph_half, x, incx); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// Computes a vector-scalar product and adds the result to a vector. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in] alpha The scale factor alpha. +/// \param [in] alpha_type The data type of alpha. +/// \param [in] x Input vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +/// \param [in, out] y Input/Output vector y. +/// \param [in] y_type Data type of the vector y. +/// \param [in] incy Stride of vector y. +inline void axpy(sycl::queue &q, int n, const void *alpha, + library_data_t alpha_type, const void *x, library_data_t x_type, + int incx, void *y, library_data_t y_type, int incy) { + std::uint64_t key = detail::get_type_combination_id(x_type, alpha_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float, + library_data_t::real_float): { + detail::axpy_impl(q, n, alpha, x, incx, y, incy); + break; + } + case detail::get_type_combination_id(library_data_t::real_double, + library_data_t::real_double): { + detail::axpy_impl(q, n, alpha, x, incx, y, incy); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::complex_float): { + detail::axpy_impl, std::complex>( + q, n, alpha, x, incx, y, incy); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::complex_double): { + detail::axpy_impl, std::complex>( + q, n, alpha, x, incx, y, incy); + break; + } + case detail::get_type_combination_id(library_data_t::real_half, + library_data_t::real_float): { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + sycl::half alaph_half(alpha_value); + detail::axpy_impl(q, n, &alaph_half, x, incx, y, incy); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// Performs rotation of points in the plane. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] n Number of elements in vector x. +/// \param [in, out] x Input/Output vector x. +/// \param [in] x_type Data type of the vector x. +/// \param [in] incx Stride of vector x. +/// \param [in, out] y Input/Output vector y. +/// \param [in] y_type Data type of the vector y. +/// \param [in] incy Stride of vector y. +/// \param [in] c Scaling factor. +/// \param [in] s Scaling factor. +/// \param [in] cs_type Data type of the scaling factors. +inline void rot(sycl::queue &q, int n, void *x, library_data_t x_type, + int incx, void *y, library_data_t y_type, int incy, + const void *c, const void *s, library_data_t cs_type) { + std::uint64_t key = detail::get_type_combination_id(x_type, cs_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float, + library_data_t::real_float): { + detail::rot_impl(q, n, x, incx, y, incy, c, s); + break; + } + case detail::get_type_combination_id(library_data_t::real_double, + library_data_t::real_double): { + detail::rot_impl(q, n, x, incx, y, incy, c, s); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::real_float): { + detail::rot_impl, float, float>(q, n, x, incx, y, incy, c, + s); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::real_double): { + detail::rot_impl, double, double>(q, n, x, incx, y, incy, c, + s); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::complex_float): { + detail::rot_impl, float, std::complex>(q, n, x, incx, y, incy, c, s); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::complex_double): { + detail::rot_impl, double, std::complex>(q, n, x, incx, y, incy, c, s); + break; + } + case detail::get_type_combination_id(library_data_t::real_half, + library_data_t::real_half): { + detail::rot_impl(q, n, x, incx, y, incy, c, s); + break; + } + case detail::get_type_combination_id(library_data_t::real_bfloat16, + library_data_t::real_bfloat16): { + detail::rot_impl(q, n, x, incx, y, incy, c, s); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// Computes matrix-matrix product with general matrices. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] a_trans Specifies the operation applied to A. +/// \param [in] b_trans Specifies the operation applied to B. +/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. +/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. +/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). +/// \param [in] alpha Scaling factor for the matrix-matrix product. +/// \param [in] a Input matrix A. +/// \param [in] a_type Data type of the matrix A. +/// \param [in] lda Leading dimension of A. +/// \param [in] b Input matrix B. +/// \param [in] b_type Data type of the matrix B. +/// \param [in] ldb Leading dimension of B. +/// \param [in] beta Scaling factor for matrix C. +/// \param [in, out] c Input/Output matrix C. +/// \param [in] c_type Data type of the matrix C. +/// \param [in] ldc Leading dimension of C. +/// \param [in] scaling_type Data type of the scaling factors. +inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, library_data_t a_type, + int lda, const void *b, library_data_t b_type, int ldb, + const void *beta, void *c, library_data_t c_type, int ldc, + library_data_t scaling_type) { + bool matched = false; + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) { + scaling_type = library_data_t::complex_float; + } else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): { + detail::gemm_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): { + detail::gemm_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): { + detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_impl(q, a_trans, b_trans, m, n, k, &alpha_half, + a, lda, b, ldb, &beta_half, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): { + float alpha_float = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast(beta), q); + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// Computes a batch of matrix-matrix product with general matrices. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] a_trans Specifies the operation applied to A. +/// \param [in] b_trans Specifies the operation applied to B. +/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. +/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. +/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). +/// \param [in] alpha Scaling factor for the matrix-matrix product. +/// \param [in] a Input matrix A. +/// \param [in] a_type Data type of the matrix A. +/// \param [in] lda Leading dimension of A. +/// \param [in] b Input matrix B. +/// \param [in] b_type Data type of the matrix B. +/// \param [in] ldb Leading dimension of B. +/// \param [in] beta Scaling factor for matrix C. +/// \param [in, out] c Input/Output matrix C. +/// \param [in] c_type Data type of the matrix C. +/// \param [in] ldc Leading dimension of C. +/// \param [in] batch_size Specifies the number of matrix multiply operations to perform. +/// \param [in] scaling_type Data type of the scaling factors. +inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a[], + library_data_t a_type, int lda, const void *b[], + library_data_t b_type, int ldb, const void *beta, + void *c[], library_data_t c_type, int ldc, + int batch_size, library_data_t scaling_type) { +#ifdef DPCT_USM_LEVEL_NONE + throw std::runtime_error("this API is unsupported when USM level is none"); +#else + bool matched = false; + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) { + scaling_type = library_data_t::complex_float; + } else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, + a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, + b, ldb, beta, c, ldc, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): { + float alpha_float = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast(beta), q); + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, &alpha_float, + a, lda, b, ldb, &beta_float, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + batch_size); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, + batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +#endif +} + +/// Computes a batch of matrix-matrix product with general matrices. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] a_trans Specifies the operation applied to A. +/// \param [in] b_trans Specifies the operation applied to B. +/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. +/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. +/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). +/// \param [in] alpha Scaling factor for the matrix-matrix product. +/// \param [in] a Input matrix A. +/// \param [in] a_type Data type of the matrix A. +/// \param [in] lda Leading dimension of A. +/// \param [in] stride_a Stride between the different A matrices. +/// \param [in] b Input matrix B. +/// \param [in] b_type Data type of the matrix B. +/// \param [in] ldb Leading dimension of B. +/// \param [in] stride_b Stride between the different B matrices. +/// \param [in] beta Scaling factor for matrix C. +/// \param [in, out] c Input/Output matrix C. +/// \param [in] c_type Data type of the matrix C. +/// \param [in] ldc Leading dimension of C. +/// \param [in] stride_c Stride between the different C matrices. +/// \param [in] batch_size Specifies the number of matrix multiply operations to perform. +/// \param [in] scaling_type Data type of the scaling factors. +inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, + oneapi::mkl::transpose b_trans, int m, int n, int k, + const void *alpha, const void *a, library_data_t a_type, + int lda, long long int stride_a, const void *b, + library_data_t b_type, int ldb, long long int stride_b, + const void *beta, void *c, library_data_t c_type, + int ldc, long long int stride_c, int batch_size, + library_data_t scaling_type) { + bool matched = false; + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) { + scaling_type = library_data_t::complex_float; + } else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, + stride_a, b, ldb, stride_b, beta, c, ldc, + stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b, + &beta_half, c, ldc, stride_c, batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +} + +/// This routines perform a special rank-k update of a symmetric matrix C by +/// general matrices A and B. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle. +/// \param [in] trans Specifies the operation to apply. +/// \param [in] n The number of rows and columns in C. +/// \param [in] k The inner dimension of matrix multiplications. +/// \param [in] alpha Scaling factor for the rank-k update. +/// \param [in] a Input matrix A. +/// \param [in] lda Leading dimension of A. +/// \param [in] b Input matrix B. +/// \param [in] ldb Leading dimension of B. +/// \param [in] beta Scaling factor for the rank-k update. +/// \param [in, out] c Input/Output matrix C. +/// \param [in] ldc Leading dimension of C. +template +inline void syrk(sycl::queue &q, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, int n, int k, const T *alpha, + const T *a, int lda, const T *b, int ldb, const T *beta, T *c, + int ldc) { + detail::rk_impl(q, uplo, trans, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); +} + +/// This routines perform a special rank-k update of a Hermitian matrix C by +/// general matrices A and B. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle. +/// \param [in] trans Specifies the operation to apply. +/// \param [in] n The number of rows and columns in C. +/// \param [in] k The inner dimension of matrix multiplications. +/// \param [in] alpha Scaling factor for the rank-k update. +/// \param [in] a Input matrix A. +/// \param [in] lda Leading dimension of A. +/// \param [in] b Input matrix B. +/// \param [in] ldb Leading dimension of B. +/// \param [in] beta Scaling factor for the rank-k update. +/// \param [in, out] c Input/Output matrix C. +/// \param [in] ldc Leading dimension of C. +template +inline void herk(sycl::queue &q, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, int n, int k, const T *alpha, + const T *a, int lda, const T *b, int ldb, const Tbeta *beta, + T *c, int ldc) { + detail::rk_impl(q, uplo, trans, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); +} + +/// This routine performs a group of trsm operations. Each trsm solves an +/// equation of the form op(A) * X = alpha * B or X * op(A) = alpha * B. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] left_right Specifies A multiplies X on the left or on the right. +/// \param [in] upper_lower Specifies A is upper or lower triangular. +/// \param [in] trans Specifies the operation applied to A. +/// \param [in] unit_diag Specifies whether A is unit triangular. +/// \param [in] m Number of rows of the B matrices. +/// \param [in] n Number of columns of the B matrices. +/// \param [in] alpha Scaling factor for the solutions. +/// \param [in] a Input matrices A. +/// \param [in] a_type Data type of the matrices A. +/// \param [in] lda Leading dimension of the matrices A. +/// \param [in, out] b Input and output matrices B. +/// \param [in] b_type Data type of the matrices B. +/// \param [in] ldb Leading dimension of the matrices B. +/// \param [in] batch_size Specifies the number of trsm operations to perform. +/// \param [in] scaling_type Data type of the scaling factors. +inline void trsm_batch(sycl::queue &q, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, int m, int n, + const void *alpha, const void **a, library_data_t a_type, + int lda, void **b, library_data_t b_type, int ldb, + int batch_size, library_data_t scaling_type) { +#ifdef DPCT_USM_LEVEL_NONE + throw std::runtime_error("this API is unsupported when USM level is none"); +#else + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, scaling_type); + switch (key) { + case detail::get_type_combination_id(library_data_t::real_float, + library_data_t::real_float, + library_data_t::real_float): { + detail::trsm_batch_impl(q, left_right, upper_lower, + trans, unit_diag, m, n, alpha, + a, lda, b, ldb, batch_size); + break; + } + case detail::get_type_combination_id(library_data_t::real_double, + library_data_t::real_double, + library_data_t::real_double): { + detail::trsm_batch_impl( + q, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, + ldb, batch_size); + break; + } + case detail::get_type_combination_id(library_data_t::complex_float, + library_data_t::complex_float, + library_data_t::complex_float): { + detail::trsm_batch_impl, std::complex, + std::complex>(q, left_right, upper_lower, + trans, unit_diag, m, n, alpha, + a, lda, b, ldb, batch_size); + break; + } + case detail::get_type_combination_id(library_data_t::complex_double, + library_data_t::complex_double, + library_data_t::complex_double): { + detail::trsm_batch_impl, std::complex, + std::complex>(q, left_right, upper_lower, + trans, unit_diag, m, n, alpha, + a, lda, b, ldb, batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } +#endif +} + +/// Computes a triangular matrix-general matrix product. +/// \param [in] q The queue where the routine should be executed. +/// \param [in] left_right Specifies A is on the left or right side of the +/// multiplication. +/// \param [in] upper_lower Specifies A is upper or lower triangular. +/// \param [in] trans Specifies the operation applied to A. +/// \param [in] unit_diag Specifies whether A is unit triangular. +/// \param [in] m Number of rows of B. +/// \param [in] n Number of columns of B. +/// \param [in] alpha Scaling factor for the matrix-matrix product. +/// \param [in] a Input matrices A. +/// \param [in] lda Leading dimension of the matrices A. +/// \param [in] b Input matrices B. +/// \param [in] ldb Leading dimension of the matrices B. +/// \param [out] c Output matrices C. +/// \param [in] ldc Leading dimension of the matrices C. +template +inline void trmm(sycl::queue &q, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, int m, int n, const T *alpha, + const T *a, int lda, const T *b, int ldb, T *c, int ldc) { + using Ty = typename DataType::T2; + auto alpha_val = dpct::get_value(alpha, q); + if (b != c) { + dpct::matrix_mem_copy(c, b, ldc, ldb, m, n, dpct::device_to_device, q); + } + auto data_a = detail::get_memory(a); + auto data_c = detail::get_memory(c); + oneapi::mkl::blas::column_major::trmm(q, left_right, upper_lower, trans, + unit_diag, m, n, alpha_val, data_a, lda, + data_c, ldc); +} + +} // namespace dpct +#endif // __DPCT_BLAS_UTILS_HPP__ diff --git a/dpct/ccl_utils.hpp b/dpct/ccl_utils.hpp new file mode 100644 index 0000000000000..07b3488c937da --- /dev/null +++ b/dpct/ccl_utils.hpp @@ -0,0 +1,286 @@ +//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_CCL_UTILS_HPP__ +#define __DPCT_CCL_UTILS_HPP__ + +#include +#include +#include +#include + +#include "device.hpp" + +namespace dpct { +namespace ccl { +namespace detail { + +/// Get stored kvs with specified kvs address. +inline std::shared_ptr & +get_kvs(const oneapi::ccl::kvs::address_type &addr) { + struct hash { + std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const { + return std::hash()(std::string_view(in.data(), in.size())); + } + }; + static std::unordered_map, hash> + kvs_map; + return kvs_map[addr]; +} + +/// Help class to init ccl environment. +class ccl_init_helper { +public: + ccl_init_helper() { oneapi::ccl::init(); } +}; + +} // namespace detail + +/// Get concatenated library version as an integer. +static inline int get_version() { + oneapi::ccl::init(); + auto ver = oneapi::ccl::get_library_version(); + return ver.major * 10000 + ver.minor * 100 + ver.update; +} + +/// Create main kvs and return its address. +static inline oneapi::ccl::kvs::address_type create_kvs_address() { + oneapi::ccl::init(); + auto ptr = oneapi::ccl::create_main_kvs(); + auto addr = ptr->get_address(); + detail::get_kvs(addr) = ptr; + return addr; +} + +/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr. +static inline std::shared_ptr +create_kvs(const oneapi::ccl::kvs::address_type &addr) { + oneapi::ccl::init(); + auto &ptr = detail::get_kvs(addr); + if (!ptr) + ptr = oneapi::ccl::create_kvs(addr); + return ptr; +} + +/// dpct communicator extension +class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper { +public: + communicator_wrapper( + int size, int rank, oneapi::ccl::kvs::address_type id, + const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr) + : _device_comm(oneapi::ccl::create_device( + static_cast(dpct::get_current_device()))), + _context_comm(oneapi::ccl::create_context(dpct::get_default_context())), + _comm(oneapi::ccl::create_communicator( + size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id), + attr)) { + _queue_init = false; + _ccl_stream_ptr = nullptr; + } + + ~communicator_wrapper() { + delete _ccl_stream_ptr; + }; + + /// Return the rank in a oneapi::ccl::communicator + /// \returns The rank corresponding to communicator object + int rank() const { + return _comm.rank(); + } + + /// Retrieves the number of rank in oneapi::ccl::communicator + /// \returns The number of the ranks + int size() const { + return _comm.size(); + } + + /// Return underlying native device, which was used in oneapi::ccl::communicator + sycl::device get_device() const { + return _comm.get_device().get_native(); + } + + /// \brief allreduce is a collective communication operation that performs the global reduction operation + /// on values from all ranks of communicator and distributes the result back to all ranks. + /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced + /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff + /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff + /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff + /// \param rtype the type of the reduction operation to be applied + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void allreduce(const void *sendbuff, void *recvbuff, size_t count, + oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype, + sycl::queue *queue_ptr) { + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype, + _comm, stream); + }, + queue_ptr); + } + + /// \brief reduce is a collective communication operation that performs the + /// global reduction operation on values from all ranks of the communicator + /// and returns the result to the root rank. + /// \param sendbuff the buffer with @c count elements of @c dtype that stores + /// local data to be reduced + /// \param recvbuff [out] the buffer to store reduced result, + /// must have the same dimension as @c sendbuff + /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff + /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff + /// \param root the rank that gets the result of reduction + /// \param rtype the type of the reduction operation to be applied + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void reduce(const void *sendbuff, void *recvbuff, size_t count, + oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype, + int root, sycl::queue *queue_ptr) { + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype, + root, _comm, stream); + }, + queue_ptr); + } + + /// \brief broadcast is a collective communication operation that broadcasts data + /// from one rank of communicator (denoted as root) to all other ranks. + /// Only support in-place operation + /// \param sendbuff the buffer with @c count elements of @c dtype that stores + /// local data to be reduced + /// \param recvbuff [out] the buffer to store reduced result + /// \param count the number of elements of type @c dtype in @c buf + /// \param dtype thedatatype of elements in @c buf + /// \param root the rank that broadcasts @c buf + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void broadcast(void *sendbuff, void *recvbuff, size_t count, + oneapi::ccl::datatype dtype, int root, + sycl::queue *queue_ptr) { + if (sendbuff != recvbuff) { + throw std::runtime_error( + "oneCCL broadcast only support in-place operation. " + "sendbuff and recvbuff must be same."); + return; + } + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm, + stream); + }, + queue_ptr); + } + + /// \brief reduce_scatter is a collective communication operation that performs the global reduction operation + /// on values from all ranks of the communicator and scatters the result in blocks back to all ranks. + /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced + /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff + /// \param recv_count the number of elements of type @c dtype in receive block + /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff + /// \param rtype the type of the reduction operation to be applied + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count, + oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype, + sycl::queue *queue_ptr) { + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count, + dtype, rtype, _comm, stream); + }, + queue_ptr); + } + + /// \brief send is a pt2pt communication operation that sends data from one rank of communicator. + /// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root + /// \param count the number of elements of type @c dtype in @c sendbuff + /// \param dtype the datatype of elements in @c sendbuff + /// \param peer the rank that receives @c sendbuff + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer, + sycl::queue *queue_ptr) { + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream); + }, + queue_ptr); + } + + /// \brief recv is a pt2pt communication operation that sends data from one rank of communicator. + /// \param recvbuff the buffer with @c count elements of @c dtype serves as receive buffer + /// \param count the number of elements of type @c dtype in @c recvbuff + /// \param dtype the datatype of elements in @c recvbuff + /// \param peer the rank that receives @c recvbuff + /// \param queue_ptr a sycl::queue ptr associated with the operation + /// \return @ref void + void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer, + sycl::queue *queue_ptr) { + call_func_wrapper( + [=](const oneapi::ccl::stream &stream) { + return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream); + }, + queue_ptr); + } + +private: + oneapi::ccl::device _device_comm; + oneapi::ccl::context _context_comm; + oneapi::ccl::communicator _comm; + sycl::queue _queue; + bool _queue_init; + oneapi::ccl::stream *_ccl_stream_ptr; + + template + void call_func_wrapper(Fn func, sycl::queue *qptr) { + if (_queue_init && *qptr != _queue) { + call_func_async(func, qptr); + } else { + if(!_queue_init) { + _queue = *qptr; + _queue_init = true; + _ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue)); + } + std::invoke(func, *_ccl_stream_ptr); + } + } + + class call_func_async { + sycl::queue *_q_ptr; + struct call_async_impl { + oneapi::ccl::stream _ccl_stream_impl; + oneapi::ccl::event _ccl_event_impl; + template + explicit call_async_impl(Fn func, sycl::queue *qptr) + : _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)), + _ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {} + }; + call_async_impl *_imp; + + public: + template + explicit call_func_async(Fn func, sycl::queue *qptr) + : _q_ptr(qptr), + _imp(new call_async_impl(func, qptr)) {} + ~call_func_async() { + _q_ptr->submit([&](sycl::handler &cgh) + { cgh.host_task([=] + { + _imp->_ccl_event_impl.wait(); + delete _imp; }); }); + } + }; +}; + +typedef dpct::ccl::communicator_wrapper *comm_ptr; + +} // namespace ccl +} // namespace dpct + +#endif // __DPCT_CCL_UTILS_HPP__ \ No newline at end of file diff --git a/dpct/device.hpp b/dpct/device.hpp new file mode 100644 index 0000000000000..729ebf625a472 --- /dev/null +++ b/dpct/device.hpp @@ -0,0 +1,781 @@ +//==---- device.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_DEVICE_HPP__ +#define __DPCT_DEVICE_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__linux__) +#include +#include +#endif +#if defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +namespace dpct { +namespace detail { +static void get_version(const sycl::device &dev, int &major, int &minor) { + // Version string has the following format: + // a. OpenCL + // b. + std::string ver; + ver = dev.get_info(); + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + major = std::stoi(&(ver[i])); + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + i++; + minor = std::stoi(&(ver[i])); +} +} // namespace detail + +/// SYCL default exception handler +inline auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cerr << "Caught asynchronous SYCL exception:" << std::endl + << e.what() << std::endl + << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + } + } +}; + +typedef sycl::event *event_ptr; + +typedef sycl::queue *queue_ptr; + +typedef char *device_ptr; + +/// Destroy \p event pointed memory. +/// +/// \param event Pointer to the sycl::event address. +static void destroy_event(event_ptr event) { + delete event; +} + +class device_info { +public: + // get interface + const char *get_name() const { return _name; } + char *get_name() { return _name; } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const { + if constexpr (std::is_same_v>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else { + return _max_work_item_sizes_i; + } + } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() { + if constexpr (std::is_same_v>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else { + return _max_work_item_sizes_i; + } + } + bool get_host_unified_memory() const { return _host_unified_memory; } + int get_major_version() const { return _major; } + int get_minor_version() const { return _minor; } + int get_integrated() const { return _integrated; } + int get_max_clock_frequency() const { return _frequency; } + int get_max_compute_units() const { return _max_compute_units; } + int get_max_work_group_size() const { return _max_work_group_size; } + int get_max_sub_group_size() const { return _max_sub_group_size; } + int get_max_work_items_per_compute_unit() const { + return _max_work_items_per_compute_unit; + } + int get_max_register_size_per_work_group() const { + return _max_register_size_per_work_group; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + size_t get_global_mem_size() const { return _global_mem_size; } + size_t get_local_mem_size() const { return _local_mem_size; } + /// Returns the maximum clock rate of device's global memory in kHz. If + /// compiler does not support this API then returns default value 3200000 kHz. + unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } + /// Returns the maximum bus width between device and memory in bits. If + /// compiler does not support this API then returns default value 64 bits. + unsigned int get_memory_bus_width() const { return _memory_bus_width; } + uint32_t get_device_id() const { return _device_id; } + std::array get_uuid() const { return _uuid; } + /// Returns global memory cache size in bytes. + unsigned int get_global_mem_cache_size() const { + return _global_mem_cache_size; + } + + // set interface + void set_name(const char* name) { + size_t length = strlen(name); + if (length < 256) { + std::memcpy(_name, name, length + 1); + } else { + std::memcpy(_name, name, 255); + _name[255] = '\0'; + } + } + void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) { + for (int i = 0; i < 3; ++i) + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + [[deprecated]] void + set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) { + for (int i = 0; i < 3; ++i) { + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + } + void set_host_unified_memory(bool host_unified_memory) { + _host_unified_memory = host_unified_memory; + } + void set_major_version(int major) { _major = major; } + void set_minor_version(int minor) { _minor = minor; } + void set_integrated(int integrated) { _integrated = integrated; } + void set_max_clock_frequency(int frequency) { _frequency = frequency; } + void set_max_compute_units(int max_compute_units) { + _max_compute_units = max_compute_units; + } + void set_global_mem_size(size_t global_mem_size) { + _global_mem_size = global_mem_size; + } + void set_local_mem_size(size_t local_mem_size) { + _local_mem_size = local_mem_size; + } + void set_max_work_group_size(int max_work_group_size) { + _max_work_group_size = max_work_group_size; + } + void set_max_sub_group_size(int max_sub_group_size) { + _max_sub_group_size = max_sub_group_size; + } + void + set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) { + _max_work_items_per_compute_unit = max_work_items_per_compute_unit; + } + void set_max_nd_range_size(int max_nd_range_size[]) { + for (int i = 0; i < 3; i++) { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + void set_memory_clock_rate(unsigned int memory_clock_rate) { + _memory_clock_rate = memory_clock_rate; + } + void set_memory_bus_width(unsigned int memory_bus_width) { + _memory_bus_width = memory_bus_width; + } + void + set_max_register_size_per_work_group(int max_register_size_per_work_group) { + _max_register_size_per_work_group = max_register_size_per_work_group; + } + void set_device_id(uint32_t device_id) { + _device_id = device_id; + } + void set_uuid(std::array uuid) { + _uuid = std::move(uuid); + } + void set_global_mem_cache_size(unsigned int global_mem_cache_size) { + _global_mem_cache_size = global_mem_cache_size; + } + +private: + char _name[256]; + int _max_work_item_sizes_i[3]; + bool _host_unified_memory = false; + int _major; + int _minor; + int _integrated = 0; + int _frequency; + // Set estimated value 3200000 kHz as default value. + unsigned int _memory_clock_rate = 3200000; + // Set estimated value 64 bits as default value. + unsigned int _memory_bus_width = 64; + unsigned int _global_mem_cache_size; + int _max_compute_units; + int _max_work_group_size; + int _max_sub_group_size; + int _max_work_items_per_compute_unit; + int _max_register_size_per_work_group; + size_t _global_mem_size; + size_t _local_mem_size; + size_t _max_nd_range_size[3]; + int _max_nd_range_size_i[3]; + uint32_t _device_id; + std::array _uuid; +}; + +static int get_major_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return major; +} + +static int get_minor_version(const sycl::device &dev) { + int major, minor; + detail::get_version(dev, major, minor); + return minor; +} + +static void get_device_info(device_info &out, const sycl::device &dev) { + device_info prop; + prop.set_name(dev.get_info().c_str()); + + int major, minor; + detail::get_version(dev, major, minor); + prop.set_major_version(major); + prop.set_minor_version(minor); + + prop.set_max_work_item_sizes( +#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) + // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes + // is an enum class element + dev.get_info()); +#else + // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by + // an int + dev.get_info>()); +#endif + prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations)); + + prop.set_max_clock_frequency( + dev.get_info() * 1000); + + prop.set_max_compute_units( + dev.get_info()); + prop.set_max_work_group_size( + dev.get_info()); + prop.set_global_mem_size(dev.get_info()); + prop.set_local_mem_size(dev.get_info()); + +#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) + if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) { + unsigned int tmp = + dev.get_info(); + if (tmp != 0) + prop.set_memory_clock_rate(1000 * tmp); + } + if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) { + prop.set_memory_bus_width( + dev.get_info()); + } + if (dev.has(sycl::aspect::ext_intel_device_id)) { + prop.set_device_id( + dev.get_info()); + } + if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) { + prop.set_uuid(dev.get_info()); + } +#elif defined(_MSC_VER) && !defined(__clang__) +#pragma message("get_device_info: querying memory_clock_rate and \ +memory_bus_width are not supported by the compiler used. \ +Use 3200000 kHz as memory_clock_rate default value. \ +Use 64 bits as memory_bus_width default value.") +#else +#warning "get_device_info: querying memory_clock_rate and \ +memory_bus_width are not supported by the compiler used. \ +Use 3200000 kHz as memory_clock_rate default value. \ +Use 64 bits as memory_bus_width default value." +#endif + + size_t max_sub_group_size = 1; + std::vector sub_group_sizes = + dev.get_info(); + + for (const auto &sub_group_size : sub_group_sizes) { + if (max_sub_group_size < sub_group_size) + max_sub_group_size = sub_group_size; + } + + prop.set_max_sub_group_size(max_sub_group_size); + + prop.set_max_work_items_per_compute_unit( + dev.get_info()); + int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + prop.set_max_nd_range_size(max_nd_range_size); + + // Estimates max register size per work group, feel free to update the value + // according to device properties. + prop.set_max_register_size_per_work_group(65536); + + prop.set_global_mem_cache_size( + dev.get_info()); + out = prop; +} + +/// dpct device extension +class device_ext : public sycl::device { + typedef std::mutex mutex_type; + +public: + device_ext() : sycl::device(), _ctx(*this) {} + ~device_ext() { + std::lock_guard lock(m_mutex); + clear_queues(); + } + device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) { + std::lock_guard lock(m_mutex); + init_queues(); + } + + int is_native_atomic_supported() { return 0; } + int get_major_version() const { + return dpct::get_major_version(*this); + } + + int get_minor_version() const { + return dpct::get_minor_version(*this); + } + + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } + + /// Return the maximum clock frequency of this device in KHz. + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } + + int get_integrated() const { return get_device_info().get_integrated(); } + + int get_max_sub_group_size() const { + return get_device_info().get_max_sub_group_size(); + } + + int get_max_register_size_per_work_group() const { + return get_device_info().get_max_register_size_per_work_group(); + } + + int get_max_work_group_size() const { + return get_device_info().get_max_work_group_size(); + } + + int get_mem_base_addr_align() const { + return get_info(); + } + + size_t get_global_mem_size() const { + return get_device_info().get_global_mem_size(); + } + + /// Get the number of bytes of free and total memory on the SYCL device. + /// \param [out] free_memory The number of bytes of free memory on the SYCL device. + /// \param [out] total_memory The number of bytes of total memory on the SYCL device. + void get_memory_info(size_t &free_memory, size_t &total_memory) { +#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) + if (!has(sycl::aspect::ext_intel_free_memory)) { + std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl; + free_memory = 0; + } else { + free_memory = get_info(); + } +#else + std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl; + free_memory = 0; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma message("Querying the number of bytes of free memory is not supported") +#else +#warning "Querying the number of bytes of free memory is not supported" +#endif +#endif + total_memory = get_device_info().get_global_mem_size(); + } + + void get_device_info(device_info &out) const { + dpct::get_device_info(out, *this); + } + + device_info get_device_info() const { + device_info prop; + dpct::get_device_info(prop, *this); + return prop; + } + + void reset() { + std::lock_guard lock(m_mutex); + clear_queues(); + init_queues(); + } + + sycl::queue &in_order_queue() { return *_q_in_order; } + + sycl::queue &out_of_order_queue() { return *_q_out_of_order; } + + sycl::queue &default_queue() { +#ifdef DPCT_USM_LEVEL_NONE + return out_of_order_queue(); +#else + return in_order_queue(); +#endif // DPCT_USM_LEVEL_NONE + } + + void queues_wait_and_throw() { + std::unique_lock lock(m_mutex); + std::vector> current_queues( + _queues); + lock.unlock(); + for (const auto &q : current_queues) { + q->wait_and_throw(); + } + // Guard the destruct of current_queues to make sure the ref count is safe. + lock.lock(); + } + + sycl::queue *create_queue(bool enable_exception_handler = false) { +#ifdef DPCT_USM_LEVEL_NONE + return create_out_of_order_queue(enable_exception_handler); +#else + return create_in_order_queue(enable_exception_handler); +#endif // DPCT_USM_LEVEL_NONE + } + + sycl::queue *create_in_order_queue(bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler); + } + + void destroy_queue(sycl::queue *&queue) { + std::lock_guard lock(m_mutex); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const std::shared_ptr &q) -> bool { + return q.get() == queue; + }), + _queues.end()); + queue = nullptr; + } + void set_saved_queue(sycl::queue* q) { + std::lock_guard lock(m_mutex); + _saved_queue = q; + } + sycl::queue *get_saved_queue() const { + std::lock_guard lock(m_mutex); + return _saved_queue; + } + sycl::context get_context() const { return _ctx; } + +private: + void clear_queues() { + _queues.clear(); + _q_in_order = _q_out_of_order = _saved_queue = nullptr; + } + + void init_queues() { + _q_in_order = create_queue_impl(true, sycl::property::queue::in_order()); + _q_out_of_order = create_queue_impl(true); + _saved_queue = &default_queue(); + } + + /// Caller should acquire resource \p m_mutex before calling this function. + template + sycl::queue *create_queue_impl(bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(std::make_shared( + _ctx, *this, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); + + return _queues.back().get(); + } + + void get_version(int &major, int &minor) const { + detail::get_version(*this, major, minor); + } + sycl::queue *_q_in_order, *_q_out_of_order; + sycl::queue *_saved_queue; + sycl::context _ctx; + std::vector> _queues; + mutable mutex_type m_mutex; +}; + +static inline unsigned int get_tid() { +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(_WIN64) + return GetCurrentThreadId(); +#else +#error "Only support Windows and Linux." +#endif +} + +/// device manager +class dev_mgr { +public: + device_ext ¤t_device() { + unsigned int dev_id=current_device_id(); + check_id(dev_id); + return *_devs[dev_id]; + } + device_ext &cpu_device() const { + std::lock_guard lock(m_mutex); + if (_cpu_device == -1) { + throw std::runtime_error("no valid cpu device"); + } else { + return *_devs[_cpu_device]; + } + } + device_ext &get_device(unsigned int id) const { + std::lock_guard lock(m_mutex); + check_id(id); + return *_devs[id]; + } + unsigned int current_device_id() const { + std::lock_guard lock(m_mutex); + auto it=_thread2dev_map.find(get_tid()); + if(it != _thread2dev_map.end()) + return it->second; + return DEFAULT_DEVICE_ID; + } + +/// Select device with a device ID. +/// \param [in] id The id of the device which can +/// be obtained through get_device_id(const sycl::device). + void select_device(unsigned int id) { + std::lock_guard lock(m_mutex); + check_id(id); + _thread2dev_map[get_tid()]=id; + } + unsigned int device_count() { return _devs.size(); } + + unsigned int get_device_id(const sycl::device &dev) { + unsigned int id = 0; + for(auto dev_item : _devs) { + if (*dev_item == dev) { + break; + } + id++; + } + return id; + } + + template + std::enable_if_t< + std::is_invocable_r_v> + select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { + sycl::device selected_device = sycl::device(selector); + unsigned int selected_device_id = get_device_id(selected_device); + select_device(selected_device_id); + } + + /// Returns the instance of device manager singleton. + static dev_mgr &instance() { + static dev_mgr d_m; + return d_m; + } + dev_mgr(const dev_mgr &) = delete; + dev_mgr &operator=(const dev_mgr &) = delete; + dev_mgr(dev_mgr &&) = delete; + dev_mgr &operator=(dev_mgr &&) = delete; + +private: + mutable std::recursive_mutex m_mutex; + dev_mgr() { + sycl::device default_device = + sycl::device(sycl::default_selector_v); + _devs.push_back(std::make_shared(default_device)); + + std::vector sycl_all_devs = + sycl::device::get_devices(sycl::info::device_type::all); + // Collect other devices except for the default device. + if (default_device.is_cpu()) + _cpu_device = 0; + for (auto &dev : sycl_all_devs) { + if (dev == default_device) { + continue; + } + _devs.push_back(std::make_shared(dev)); + if (_cpu_device == -1 && dev.is_cpu()) { + _cpu_device = _devs.size() - 1; + } + } + } + void check_id(unsigned int id) const { + if (id >= _devs.size()) { + throw std::runtime_error("invalid device id"); + } + } + std::vector> _devs; + /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current + /// thread id in _thread2dev_map, which means default device should be used + /// for the current thread. + const unsigned int DEFAULT_DEVICE_ID = 0; + /// thread-id to device-id map. + std::map _thread2dev_map; + int _cpu_device = -1; +}; + +/// Util function to get the default queue of current selected device depends on +/// the USM config. Return the default out-of-ordered queue when USM-none is +/// enabled, otherwise return the default in-ordered queue. +static inline sycl::queue &get_default_queue() { + return dev_mgr::instance().current_device().default_queue(); +} + +/// Util function to get the default in-ordered queue of current device in +/// dpct device manager. +static inline sycl::queue &get_in_order_queue() { + return dev_mgr::instance().current_device().in_order_queue(); +} + +/// Util function to get the default out-of-ordered queue of current device in +/// dpct device manager. +static inline sycl::queue &get_out_of_order_queue() { + return dev_mgr::instance().current_device().out_of_order_queue(); +} + +/// Util function to get the id of current device in +/// dpct device manager. +static inline unsigned int get_current_device_id() { + return dev_mgr::instance().current_device_id(); +} + +/// Util function to get the current device. +static inline device_ext &get_current_device() { + return dev_mgr::instance().current_device(); +} + +/// Util function to get a device by id. +static inline device_ext &get_device(unsigned int id) { + return dev_mgr::instance().get_device(id); +} + +/// Util function to get the context of the default queue of current +/// device in dpct device manager. +static inline sycl::context get_default_context() { + return dpct::get_current_device().get_context(); +} + +/// Util function to get a CPU device. +static inline device_ext &cpu_device() { + return dev_mgr::instance().cpu_device(); +} + +static inline unsigned int select_device(unsigned int id) { + dev_mgr::instance().select_device(id); + return id; +} + +template +static inline std::enable_if_t< + std::is_invocable_r_v> +select_device(const DeviceSelector &selector = sycl::gpu_selector_v) { + dev_mgr::instance().select_device(selector); +} + +static inline unsigned int get_device_id(const sycl::device &dev){ + return dev_mgr::instance().get_device_id(dev); +} + +/// Util function to check whether a device supports some kinds of sycl::aspect. +inline void +has_capability_or_fail(const sycl::device &dev, + const std::initializer_list &props) { + for (const auto &it : props) { + if (dev.has(it)) + continue; + switch (it) { + case sycl::aspect::fp64: + throw std::runtime_error("'double' is not supported in '" + + dev.get_info() + + "' device"); + break; + case sycl::aspect::fp16: + throw std::runtime_error("'half' is not supported in '" + + dev.get_info() + + "' device"); + break; + default: +#define __SYCL_ASPECT(ASPECT, ID) \ + case sycl::aspect::ASPECT: \ + return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string { + switch (AspectNum) { +#include +#include + default: + return "unknown aspect"; + } + }; +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT + throw std::runtime_error( + "'" + getAspectNameStr(it) + "' is not supported in '" + + dev.get_info() + "' device"); + } + break; + } +} +} // namespace dpct + +#endif // __DPCT_DEVICE_HPP__ diff --git a/dpct/dnnl_utils.hpp b/dpct/dnnl_utils.hpp new file mode 100644 index 0000000000000..caf5a768b77e2 --- /dev/null +++ b/dpct/dnnl_utils.hpp @@ -0,0 +1,4921 @@ +//==---- dnnl_utils.hpp ---------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_DNNL_UTILS_HPP__ +#define __DPCT_DNNL_UTILS_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory.hpp" +#include "device.hpp" +#include "lib_common_utils.hpp" + +namespace dpct { +namespace dnnl { +/// Get concatenated library version as an integer. +static inline size_t get_version() { + const ::dnnl::version_t *ver = ::dnnl::version(); + return ver->major * 1000 + ver->minor * 100 + ver->patch; +} +class engine_ext; +typedef oneapi::mkl::rng::philox4x32x10 rng_engine_t; +/// An enum class representing memory layout. Used by +/// memory_desc_ext to create a memory with pre-defined layout. +enum class memory_format_tag { nchw, nhwc, nchw_blocked }; + +/// An enum class representing RNN data memory layout. Used by +/// memory_desc_ext to create a memory with pre-defined layout. +enum class rnn_memory_format_tag { tnc, ntc }; + +/// A class holding the description of an N-dimensions memory. +class memory_desc_ext { + ::dnnl::memory::desc _desc; +public: + /// Convert dpct::library_data_t to dnnl::memory::data_type. + static ::dnnl::memory::data_type to_dnnl_data_type(dpct::library_data_t dt); + /// Convert dnnl::memory::data_type to dpct::library_data_t. + static dpct::library_data_t + to_dpct_library_data_t(::dnnl::memory::data_type dt, unsigned block_size); + /// Convert dpct::dnnl::memory_format_tag to dnnl::memory::format_tag. + static ::dnnl::memory::format_tag to_dnnl_format_tag(dpct::library_data_t dt, + memory_format_tag tag); + memory_desc_ext() = default; + memory_desc_ext(::dnnl::memory::desc &desc) : _desc(desc) {} + memory_desc_ext(::dnnl::memory::desc &&desc) : _desc(std::move(desc)) {} + /// Setting a 4D memory with given parameters. + /// \param [in] tag Format tag. + /// \param [in] dt Data type. + /// \param [in] n Number of images. + /// \param [in] c Number of channels. + /// \param [in] h Height of images. + /// \param [in] w Width of images. + void set(memory_format_tag tag, dpct::library_data_t dt, int n, int c, int h, + int w); + /// Setting a 3D RNN data memory with given parameters. + /// \param [in] tag RNN data format tag. + /// \param [in] dt Data type. + /// \param [in] t Number of sequence length. + /// \param [in] n Number of batch. + /// \param [in] c Height of input channel. + void set(rnn_memory_format_tag tag, dpct::library_data_t dt, int t, int n, int c); + /// Setting a 4D memory with given parameters. + /// \param [in] dt Data type. + /// \param [in] n Number of images. + /// \param [in] c Number of channels. + /// \param [in] h Height of images. + /// \param [in] w Width of images. + /// \param [in] n_stride Stride between two continuous images. + /// \param [in] c_stride Stride between two continuous channels. + /// \param [in] h_stride Stride between two continuous rows. + /// \param [in] w_stride Stride between two continuous columns. + void set(dpct::library_data_t dt, int n, int c, int h, int w, int n_stride, + int c_stride, int h_stride, int w_stride); + /// Setting a ND memory with given parameters. + /// \param [in] dt Data type. + /// \param [in] ndims Dimension of the memory. + /// \param [in] dims Array of dimension ndims that contain the size of each + /// memory dimension. \param [in] strides Array of dimension ndims that + /// contain the stride of each memory dimension. + void set(dpct::library_data_t dt, int ndims, const int dims[], + const int strides[]); + /// Setting a ND memory with given parameters. + /// \param [in] tag Format tag. + /// \param [in] dt Data type. + /// \param [in] ndims Dimension of the memory. + /// \param [in] dims Array of dimension ndims that contain the size of each + /// memory dimension. + void set(memory_format_tag tag, dpct::library_data_t dt, int ndims, + const int dims[]); + /// Getting a ::dnnl::memory::desc from a memory_desc_ext. + /// \returns The ::dnnl::memory::desc. + const ::dnnl::memory::desc &get_desc() const { return _desc; } + /// Setting holding desc with given dnnl memory descriptor. + void set_desc(::dnnl::memory::desc desc) { _desc = desc; } + /// Getting a size of a memory_desc_ext in bytes. + /// \returns The size. + size_t get_size() const { return _desc.get_size(); } + /// Getting parameters from a 4D memory. + /// \param [out] dt Data type. + /// \param [out] n Number of images. + /// \param [out] c Number of channels. + /// \param [out] h Height of images. + /// \param [out] w Width of images. + /// \param [out] n_stride Stride between two continuous images. + /// \param [out] c_stride Stride between two continuous channels. + /// \param [out] h_stride Stride between two continuous rows. + /// \param [out] w_stride Stride between two continuous columns. + void get(dpct::library_data_t *dt, int *n, int *c, int *h, int *w, + int *n_stride, int *c_stride, int *h_stride, int *w_stride) const; + /// Getting parameters from a 4D memory. + /// \param [out] dt Data type. + /// \param [out] tag Format tag. + /// \param [out] n Number of images. + /// \param [out] c Number of channels. + /// \param [out] h Height of images. + /// \param [out] w Width of images. + void get(dpct::library_data_t *dt, memory_format_tag *tag, int *n, int *c, + int *h, int *w) const; + /// Getting parameters from a 3D RNN data memory. + /// \param [out] dt Data type. + /// \param [out] tag RNN data format tag. + /// \param [out] t Number of sequence length. + /// \param [out] n Number of batch. + /// \param [out] c Height of input channel. + void get(dpct::library_data_t *dt, rnn_memory_format_tag *tag, int *t, int *n, + int *c) const; + /// Getting parameters from a ND memory. + /// \param [in] requested_ndims Requested number of dimensions to get from a + /// given memory descriptor. + /// \param [out] dt Data type. + /// \param [out] ndims Dimension of the memory. + /// \param [out] dims Array of dimension requested_ndims that contain the + /// size of each memory dimension. + /// \param [out] strides Array of dimension requested_ndims that contain the + /// stride of each memory dimension. + void get(int requested_ndims, dpct::library_data_t *dt, int *ndims, + int dims[], int strides[]) const; + /// Getting parameters from a ND memory. + /// \param [in] requested_ndims Requested number of dimensions to get from a + /// given memory descriptor. + /// \param [out] dt Data type. + /// \param [out] tag Format tag. + /// \param [out] ndims Dimension of the memory. + /// \param [out] dims Array of dimension requested_ndims that contain the + /// size of each memory dimension. + void get(int requested_ndims, dpct::library_data_t *dt, + memory_format_tag *tag, int *ndims, int dims[]) const; + /// Getting dims from a ND memory. + /// \return The dims. + std::vector get_dims() const { return _desc.get_dims(); } + /// Getting strides from a ND memory. + /// \return The strides. + std::vector get_strides() const { + return _desc.get_strides(); + } + /// Getting element num from a ND memory. + /// \return The element number. + size_t get_element_num() const { + auto dims = _desc.get_dims(); + if (dims.empty()) { + return 0; + } + size_t result = 1; + for (auto &dim : dims) { + result *= dim; + } + return result; + } + + operator bool() const { + return bool(_desc); + } + + memory_desc_ext &operator=(std::nullptr_t) { + _desc.reset(nullptr); + return *this; + } +}; + +/// A class holding description for an activation operation. +class activation_desc { + ::dnnl::algorithm _alg; + float _alpha; + float _beta; + +public: + /// Setting an activation descriptor with given parameters. + /// \param [in] alg Activation algorithm. + /// \param [in] alpha Value of alpha parameter. + void set(::dnnl::algorithm alg, float alpha) { + _alg = alg; + if(alg == ::dnnl::algorithm::eltwise_clip) { + _alpha = 0; + _beta = alpha; + } else { + _alpha = alpha; + } + } + /// Getting parameters form an activation descriptor. + /// \param [out] alg Activation algorithm. + /// \param [out] alpha Value of alpha parameter. + void get(::dnnl::algorithm *alg, float *alpha) const { + *alg = _alg; + if(_alg == ::dnnl::algorithm::eltwise_clip) { + *alpha = _beta; + } else { + *alpha = _alpha; + } + } + /// Setting the alpha parameter of an activation descriptor. + /// \param [in] alpha Value of alpha parameter. + void set_alpha(float alpha) { _alpha = alpha; } + /// Setting the beta parameter of an activation descriptor. + /// \param [in] beta Value of beta parameter. + void set_beta(float beta) { _beta = beta; } + /// Setting the algorithm parameter of an activation descriptor. + /// \param [in] alg Activation algorithm. + void set_algorithm(::dnnl::algorithm alg) { _alg = alg; } + /// Getting the alpha parameter from an activation descriptor. + /// \param [out] alpha Value of alpha parameter. + float get_alpha() const { return _alpha; } + /// Getting the beta parameter from an activation descriptor. + /// \param [out] beta Value of beta parameter. + float get_beta() const { return _beta; } + /// Getting the algorithm parameter from an activation descriptor. + /// \param [out] alg Activation algorithm. + ::dnnl::algorithm get_algorithm() const { return _alg; } +}; + +/// A class holding description for a local response normalization operation. +class lrn_desc { + unsigned int _local_size; + float _alpha; + float _beta; + float _k; + +public: + /// Setting a local response normalization descriptor with given parameters. + /// \param [in] local_size Value of local_size parameter. + /// \param [in] alpha Value of alpha parameter. + /// \param [in] beta Value of beta parameter. + /// \param [in] k Value of k parameter. + void set(unsigned int local_size, float alpha, float beta, float k) { + _local_size = local_size; + _alpha = alpha; + _beta = beta; + _k = k; + } + /// Getting parameters form a local response normalization descriptor. + /// \param [out] local_size Value of local_size parameter. + /// \param [out] alpha Value of alpha parameter. + /// \param [out] beta Value of beta parameter. + /// \param [out] k Value of k parameter. + void get(unsigned int *local_size, float *alpha, float *beta, + float *k) const { + *local_size = _local_size; + *alpha = _alpha; + *beta = _beta; + *k = _k; + } + /// Setting the local size parameter of a local response normalization + /// descriptor. + /// \param [in] local_size Value of local_size parameter. + void set_local_size(unsigned int local_size) { _local_size = local_size; } + /// Setting the alpha parameter of a local response normalization descriptor. + /// \param [in] alpha Value of alpha parameter. + void set_alpha(float alpha) { _alpha = alpha; } + /// Setting the beta parameter of a local response normalization descriptor. + /// \param [in] beta Value of beta parameter. + void set_beta(float beta) { _beta = beta; } + /// Setting the k parameter of a local response normalization descriptor. + /// \param [in] k Value of k parameter. + void set_k(float k) { _k = k; } + /// Getting the local size parameter from a local response normalization + /// descriptor. + /// \param [out] local_size Value of local_size parameter. + unsigned int get_local_size() const { return _local_size; } + /// Getting the alpha parameter from a local response normalization + /// descriptor. + /// \param [out] alpha Value of alpha parameter. + float get_alpha() const { return _alpha; } + /// Getting the beta parameter from a local response normalization descriptor. + /// \param [out] beta Value of beta parameter. + float get_beta() const { return _beta; } + /// Getting the k parameter from a local response normalization descriptor. + /// \param [out] k Value of k parameter. + float get_k() const { return _k; } +}; + +/// An enum class representing softmax algorithm. +enum class softmax_algorithm { normal, log }; +/// An enum class representing softmax mode. +enum class softmax_mode { instance, channel }; + +/// A class holding description for a pooling operation. +class pooling_desc { + ::dnnl::algorithm _alg; + std::vector _stride; + std::vector _kernel; + std::vector _padding; + +public: + /// Setting a 2D pooling descriptor with given parameters. + /// \param [in] alg Pooling algorithm. + /// \param [in] kernel_h Value of height of kernel. + /// \param [in] kernel_w Value of width of kernel. + /// \param [in] padding_h Value of height of padding. + /// \param [in] padding_w Value of width of padding. + /// \param [in] stride_h Value of height of stride. + /// \param [in] stride_w Value of width of stride. + void set(::dnnl::algorithm alg, int kernel_h, int kernel_w, int padding_h, + int padding_w, int stride_h, int stride_w) { + _alg = alg; + _stride = {stride_h, stride_w}; + _kernel = {kernel_h, kernel_w}; + _padding = {padding_h, padding_w}; + } + /// Setting a ND pooling descriptor with given parameters. + /// \param [in] alg Pooling algorithm. + /// \param [in] ndims Dimension of the pooling operation. + /// \param [in] kernel Array of dimension ndims containing the kernel size of + /// each dimension. + /// \param [in] padding Array of dimension ndims containing the padding size of + /// each dimension. + /// \param [in] stride Array of dimension ndims containing the stride size of + /// each dimension. + void set(::dnnl::algorithm alg, int ndims, int kernel[], int padding[], + int stride[]) { + _alg = alg; + _stride = std::vector(stride, stride + ndims); + _kernel = std::vector(kernel, kernel + ndims); + _padding = std::vector(padding, padding + ndims); + } + /// Getting parameters from a 2D pooling descriptor. + /// \param [out] alg Pooling algorithm. + /// \param [out] kernel_h Value of height of kernel. + /// \param [out] kernel_w Value of width of kernel. + /// \param [out] padding_h Value of height of padding. + /// \param [out] padding_w Value of width of padding. + /// \param [out] stride_h Value of height of stride. + /// \param [out] stride_w Value of width of stride. + void get(::dnnl::algorithm *alg, int *kernel_h, int *kernel_w, int *padding_h, + int *padding_w, int *stride_h, int *stride_w) const { + *alg = _alg; + *kernel_h = _kernel[0]; + *kernel_w = _kernel[1]; + *padding_h = _padding[0]; + *padding_w = _padding[1]; + *stride_h = _stride[0]; + *stride_w = _stride[1]; + } + /// Getting parameters from a ND pooling descriptor. + /// \param [in] requested_ndims Requested number of dimensions to get from a + /// given pooling descriptor. + /// \param [out] alg Pooling algorithm. + /// \param [out] ndims Dimension of the pooling operation. + /// \param [out] kernel Array of dimension ndims containing the kernel size of + /// each dimension. + /// \param [out] padding Array of dimension ndims containing the padding size + /// of each dimension. + /// \param [out] stride Array of dimension ndims containing the stride size of + /// each dimension. + void get(int requested_ndims, ::dnnl::algorithm *alg, int *ndims, + int kernel[], int padding[], int stride[]) const { + *alg = _alg; + *ndims = _stride.size(); + for (int i = 0; i < requested_ndims; i++) { + kernel[i] = _kernel[i]; + padding[i] = _padding[i]; + stride[i] = _stride[i]; + } + } + /// Setting the algorithm parameter of a pooling descriptor. + /// \param [in] alg Pooling algorithm. + void set_algorithm(::dnnl::algorithm alg) { _alg = alg; } + /// Setting the stride parameter of a pooling descriptor. + /// \param [in] stride Array of dimension ndims containing the stride size of + /// each dimension. + void set_stride(const std::vector &stride) { _stride = stride; } + /// Setting the kernel parameter of a pooling descriptor. + /// \param [in] kernel Array of dimension ndims containing the kernel size of + /// each dimension. + void set_kernel(const std::vector &kernel) { _kernel = kernel; } + /// Setting the padding parameter of a pooling descriptor. + /// \param [in] padding Array of dimension ndims containing the padding size + /// of each dimension. + void set_padding(const std::vector &padding) { _padding = padding; } + + /// Getting the algorithm parameter from a pooling descriptor. + /// \param [out] alg Pooling algorithm. + ::dnnl::algorithm get_algorithm() const { return _alg; } + /// Getting the stride parameter from a pooling descriptor. + /// \returns Array of dimension ndims containing the stride size of each + /// dimension. + const std::vector &get_stride() const { return _stride; } + /// Getting the kernel parameter from a pooling descriptor. + /// \returns Array of dimension ndims containing the kernel size of each + /// dimension. + const std::vector &get_kernel() const { return _kernel; } + /// Getting the padding parameter from a pooling descriptor. + /// \returns Array of dimension ndims containing the padding size of each + /// dimension. + const std::vector &get_padding() const { return _padding; } + /// Getting the output dimensions of a memory after 2D pooling has been + /// applied. + /// \param [in] desc Input memory descriptor. + /// \param [out] out_n Number of images. + /// \param [out] out_c Number of channels. + /// \param [out] out_h Height of images. + /// \param [out] out_w Width of images. + void get_forward_output_dim(const memory_desc_ext &desc, int *out_n, + int *out_c, int *out_h, int *out_w) const { + auto dims = desc.get_dims(); + *out_n = dims[0]; + *out_c = dims[1]; + *out_h = 1 + (dims[2] + 2 * _padding[0] - _kernel[0]) / _stride[0]; + *out_w = 1 + (dims[3] + 2 * _padding[1] - _kernel[1]) / _stride[1]; + } + /// Getting the output dimensions of a memory after ND pooling has been + /// applied. + /// \param [in] desc Input memory descriptor. + /// \param [out] ndims Dimension of the memory. + /// \param [out] out_dims Array of dimension requested_ndims that contain + /// the size of each memory dimension. + void get_forward_output_dim(const memory_desc_ext &desc, int ndims, + int out_dims[]) const { + assert(ndims >= 4 && "ndims is at least 4."); + auto dims = desc.get_dims(); + out_dims[0] = dims[0]; + out_dims[1] = dims[1]; + for (int i = 2; i < ndims; i++) { + out_dims[i] = + 1 + (dims[i] + 2 * _padding[i - 2] - _kernel[i - 2]) / _stride[i - 2]; + } + } +}; + +/// An enum class representing reduction operations. +enum class reduction_op { + max, + min, + sum, + mul, + mean, + amax, + mul_no_zeros, + norm1, + norm2 +}; + +/// An enum class representing batch normalization mode. +enum class batch_normalization_mode { per_activation, spatial }; + +/// An enum class representing batch normalization operations. +enum class batch_normalization_ops { none, activation, add_activation }; + +/// An enum class representing binary operations. +enum class binary_op { add, sub, mul, div, min, max, sqrt, neg }; + +/// An struct representing convolution algorithm infomation. +struct convolution_algorithm_info { + ::dnnl::algorithm algo = ::dnnl::algorithm::convolution_auto; + int status = 0; +}; + +/// A class holding description for a convolution operation. +class convolution_desc { + std::vector _strides; + std::vector _dilates; + std::vector _paddings; + int _group_count = 1; + ::dnnl::fpmath_mode _math_mode = ::dnnl::fpmath_mode::strict; +public: + /// Setting a group count to be used in the convolution. + /// \param [in] group_count Value of group count. + void set_group_count(int group_count) { _group_count = group_count; } + /// Getting a group count specified in the given convolution descriptor. + /// \returns Value of group count. + int get_group_count() { return _group_count; } + /// Setting floating point math mode to be used in the convolution. + /// \param [in] math_mode Value of math_mode. + void set_math_mode(::dnnl::fpmath_mode math_mode) { _math_mode = math_mode; } + /// Getting floating point math mode specified in the given convolution descriptor. + /// \returns Value of math mode. + ::dnnl::fpmath_mode get_math_mode() { return _math_mode; } + /// Setting a 2D convolution descriptor with given parameters. + /// \param [in] padding_h Value of height of padding. + /// \param [in] padding_w Value of width of padding. + /// \param [in] stride_h Value of height of stride. + /// \param [in] stride_w Value of width of stride. + /// \param [in] dilate_h Value of height of dilate. + /// \param [in] dilate_w Value of width of dilate. + void set(int padding_h, int padding_w, int stride_h, int stride_w, + int dilate_h, int dilate_w) { + _strides = {stride_h, stride_w}; + _dilates = {dilate_h - 1, dilate_w - 1}; + _paddings = {padding_h, padding_w}; + } + /// Setting a ND convolution descriptor with given parameters. + /// \param [in] ndims Dimension of the convolution operation. + /// \param [in] paddings Array of dimension ndims containing the padding size of + /// each dimension. + /// \param [in] strides Array of dimension ndims containing the stride size of + /// each dimension. + /// \param [in] dilates Array of dimension ndims containing the kernel size of + /// each dimension. + void set(int ndims, int paddings[], int strides[], int dilates[]) { + _strides = std::vector(strides, strides + ndims); + _paddings = std::vector(paddings, paddings + ndims); + _dilates = std::vector(dilates, dilates + ndims); + for (auto &dilate : _dilates) { + dilate--; + } + } + /// Getting parameters from a 2D convolution descriptor. + /// \param [out] padding_h Value of height of padding. + /// \param [out] padding_w Value of width of padding. + /// \param [out] stride_h Value of height of stride. + /// \param [out] stride_w Value of width of stride. + /// \param [out] dilate_h Value of height of dilate. + /// \param [out] dilate_w Value of width of dilate. + void get(int *padding_h, int *padding_w, int *stride_h, int *stride_w, + int *dilate_h, int *dilate_w) const { + *dilate_h = _dilates[0]; + *dilate_w = _dilates[1]; + *padding_h = _paddings[0]; + *padding_w = _paddings[1]; + *stride_h = _strides[0]; + *stride_w = _strides[1]; + } + /// Getting parameters from a ND convolution descriptor. + /// \param [in] requested_ndims Requested number of dimensions to get from a + /// given convolution descriptor. + /// \param [out] ndims Dimension of the pooling operation. + /// \param [out] paddings Array of dimension ndims containing the padding size + /// of each dimension. + /// \param [out] strides Array of dimension ndims containing the stride size of + /// each dimension. + /// \param [out] dilates Array of dimension ndims containing the dilate size of + /// each dimension. + void get(int requested_ndims, int *ndims, int paddings[], int strides[], + int dilates[]) const { + *ndims = _strides.size(); + for (int i = 0; i < requested_ndims; i++) { + dilates[i] = _dilates[i]; + paddings[i] = _paddings[i]; + strides[i] = _strides[i]; + } + } + /// Getting the stride parameter from a convolution descriptor. + /// \returns Array of dimension ndims containing the stride size of each + /// dimension. + const std::vector &get_stride() const { return _strides; } + /// Getting the kernel parameter from a convolution descriptor. + /// \returns Array of dimension ndims containing the dilate size of each + /// dimension. + const std::vector &get_dilate() const { return _dilates; } + /// Getting the padding parameter from a convolution descriptor. + /// \returns Array of dimension ndims containing the padding size of each + /// dimension. + const std::vector &get_padding() const { return _paddings; } + /// Getting the output dimensions of a memory after 2D convolution has been + /// applied. + /// \param [in] desc Input memory descriptor. + /// \param [in] weight_desc Input weight memory descriptor. + /// \param [out] out_n Number of images. + /// \param [out] out_c Number of channels. + /// \param [out] out_h Height of images. + /// \param [out] out_w Width of images. + void get_forward_output_dim(const memory_desc_ext &desc, + const memory_desc_ext &weight_desc, int *out_n, + int *out_c, int *out_h, int *out_w) const { + auto dims = desc.get_dims(); + auto weight_dims = weight_desc.get_dims(); + *out_n = dims[0]; + *out_c = weight_dims[0]; + *out_h = 1 + (dims[2] + 2 * _paddings[0] - + (1 + (_dilates[0] * (weight_dims[2] - 1)))) / + _strides[0]; + *out_w = 1 + (dims[3] + 2 * _paddings[1] - + (1 + (_dilates[1] * (weight_dims[3] - 1)))) / + _strides[1]; + } + /// Getting the output dimensions of a memory after ND convolution has been + /// applied. + /// \param [in] desc Input memory descriptor. + /// \param [in] weight_desc Input weight memory descriptor. + /// \param [out] ndims Dimension of the memory. + /// \param [out] out_dims Array of dimension requested_ndims that contain + /// the size of each memory dimension. + void get_forward_output_dim(const memory_desc_ext &desc, + const memory_desc_ext &weight_desc, int ndims, + int out_dims[]) const { + assert(ndims >= 4 && "ndims is at least 4."); + auto dims = desc.get_dims(); + auto weight_dims = weight_desc.get_dims(); + out_dims[0] = dims[0]; + out_dims[1] = weight_dims[1]; + for (int i = 2; i < ndims; i++) { + out_dims[i] = 1 + (dims[i] + 2 * _paddings[i - 2] - + (1 + (_dilates[i - 2] * (weight_dims[i] - 1)))) / + _strides[i - 2]; + } + } + + convolution_desc &operator=(std::nullptr_t) { + return *this = convolution_desc(); + } + + operator bool() const { + return !(_strides.size() == 0 + && _dilates.size() == 0 + && _paddings.size() == 0); + } +}; + +/// An enum class representing rnn mode. +enum class rnn_mode { vanilla_relu, vanilla_tanh, lstm, gru }; + +/// An enum class representing rnn bias mode. +enum class rnn_bias_mode { none, single }; + +/// An enum class representing rnn direction. +enum class rnn_direction {unidirectional, bidirectional}; + +/// A class holding description for a RNN operation. +class rnn_desc { + rnn_mode _mode; + rnn_bias_mode _bias_mode; + rnn_direction _direction; + dpct::library_data_t _dt; + int _input_size; + int _hidden_size; + int _projection_size; + int _layer_size; + +public: + void set(rnn_mode mode, rnn_bias_mode bias_mode, rnn_direction direction, + dpct::library_data_t dt, int input_size, int hidden_size, + int projection_size, int layer_size) { + _mode = mode; + _bias_mode = bias_mode; + _direction = direction; + _input_size = input_size; + _hidden_size = hidden_size; + _projection_size = projection_size; + _layer_size = layer_size; + _dt = dt; + } + void get(rnn_mode *mode, rnn_bias_mode *bias_mode, rnn_direction *direction, + dpct::library_data_t *dt, int *input_size, int *hidden_size, + int *projection_size, int *layer_size) const { + *mode = _mode; + *bias_mode = _bias_mode; + *direction = _direction; + *input_size = _input_size; + *hidden_size = _hidden_size; + *projection_size = _projection_size; + *layer_size = _layer_size; + *dt = _dt; + } +}; + +/// A class holding description for a Dropout operation. +class dropout_desc { + struct dropout_desc_imp { + float _p = 0.5f; + unsigned long long _seed = 1; + void *_state = nullptr; + std::vector _host_state; + rng_engine_t _rng_engine; + dropout_desc_imp() : _rng_engine(dpct::get_default_queue(), 1) {} + }; + std::shared_ptr _imp; + + void generate(sycl::queue *q, std::int64_t required_state_size, + std::int64_t num, void *buffer) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) " + "Interfaces Project does not support this API."); +#else + sycl::event e_gen = oneapi::mkl::rng::generate( + oneapi::mkl::rng::bernoulli(1.f - _imp->_p), + _imp->_rng_engine, num, (std::int32_t *)buffer); + sycl::event e_save = q->submit([&](sycl::handler &cgh) { + cgh.depends_on(e_gen); + cgh.host_task([=] { + oneapi::mkl::rng::save_state(_imp->_rng_engine, + _imp->_host_state.data()); + }); + }); + q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size, + e_save); +#endif + } +public: + operator bool() const { + return bool(_imp); + } + dropout_desc &operator=(std::nullptr_t) { + _imp.reset(); + return *this; + } + /// Initializing a dropout descriptor. + void init(){ + _imp = std::make_shared(); + } + /// Setting a dropout descriptor with given parameters. + /// \param [in] engine Engine of the dropout operation. + /// \param [in] p Probability of value set to zero. + /// \param [in] state Memory that store random generator state. + /// \param [in] state_size Required size to store random generator state. + /// \param [in] seed Seed to initialize conditions of the generator state. + void set(engine_ext &engine, float p, void *state, size_t state_size, + unsigned long long seed); + /// Getting parameters from a dropout descriptor. + /// \param [in] engine Engine of the dropout operation. + /// \param [in] p Probability of value set to zero. + /// \param [in] state Memory that store random generator state. + /// \param [in] seed Seed to initialize conditions of the generator state. + void get(float *p, void **states, unsigned long long *seed) const noexcept { + *seed = _imp->_seed; + *states = _imp->_state; + *p = _imp->_p; + } + /// Getting the probability of value set to zero. + /// \returns Probability. + float get_probability() const noexcept { return _imp->_p; } + /// Restoreing a dropout descriptor from stored state. + /// \param [in] engine Engine of the dropout operation. + /// \param [in] p Probability of value set to zero. + /// \param [in] state Memory that store random generator state. + /// \param [in] state_size Required size to store random generator state. + /// \param [in] seed Seed to initialize conditions of the generator state. + void restore(engine_ext &engine, float p, void *state, size_t state_size, + unsigned long long seed); + friend class engine_ext; +}; + +namespace detail { +typedef std::string primitive_cache_key_type; +typedef std::list usage_list_type; +struct primitive_cache_value_type { + ::dnnl::primitive *_primitive; + std::unordered_map *_args; + usage_list_type::iterator _usage_it; + std::function _destructor; + sycl::event _e; + sycl::queue _q; + primitive_cache_value_type( + ::dnnl::primitive *primitive, + std::unordered_map *args, + usage_list_type::iterator usage_it, + std::function destructor, sycl::event e, + sycl::queue q) + : _primitive(primitive), _args(args), _usage_it(usage_it), + _destructor(destructor), _e(e), _q(q) {} +}; +struct primitive_and_args { + ::dnnl::primitive *primitive; + std::unordered_map *args; +}; +typedef std::unordered_map> + cache_map_type; + +// The primitive cache uses LRU replacement policy, and the default cache +// capacity is 1024. +class primitive_cache { + int _capacity = 1024; + usage_list_type usage; + cache_map_type cache_map; + void touch(cache_map_type::iterator it, sycl::event e = {}, + bool update_event = false) { + if (it->second->_usage_it != usage.begin()) { + const primitive_cache_key_type &key = it->first; + usage.erase(it->second->_usage_it); + usage.push_front(key); + it->second->_usage_it = usage.begin(); + } + if (update_event) { + it->second->_e = e; + } + } + +public: + std::shared_ptr + get(const primitive_cache_key_type &key) { + auto it = cache_map.find(key); + if (it == cache_map.end()) { + return nullptr; + } + touch(it); + return it->second; + } + void put(const primitive_cache_key_type &key, ::dnnl::primitive *value, + std::unordered_map *args, + std::function destructor, sycl::event e, + sycl::queue *q) { + auto it = cache_map.find(key); + if (it != cache_map.end()) { + touch(it, e, true); + } else { + if (cache_map.size() == _capacity) { + auto v = *(cache_map.find(usage.back())->second); + v._q.submit([=](sycl::handler &cgh) { + cgh.depends_on(v._e); + cgh.host_task([=] { + delete v._args; + v._destructor(v._primitive); + }); + }); + cache_map.erase(usage.back()); + usage.pop_back(); + } + usage.push_front(key); + cache_map[key] = std::make_shared( + value, args, usage.begin(), destructor, e, *q); + } + } +}; +} // namespace detail + +/// A class holding the oneDNN engine. +class engine_ext { + struct output_argument_info { + float _alpha; + float _beta; + int _name; + memory_desc_ext _desc; + void *_data; + output_argument_info(float alpha, float beta, int name, + memory_desc_ext desc, void *data) + : _alpha(alpha), _beta(beta), _name(name), _desc(desc), _data(data) {} + output_argument_info(float alpha, float beta, memory_desc_ext desc, + void *data) + : _alpha(alpha), _beta(beta), _name(0), _desc(desc), _data(data) {} + }; + struct buffer_info { + size_t capacity = 0; + uint8_t *buffer = nullptr; + size_t usage = 0; + sycl::queue q; + sycl::event deps; + size_t primitive_depth = 0; + }; + struct internal_resource { + std::int64_t random_engine_state_size = -1; + buffer_info binfo; + }; + std::shared_ptr<::dnnl::engine> _eng = nullptr; + std::shared_ptr<::dnnl::stream> _s = nullptr; + sycl::queue *_q = nullptr; + unsigned int _engine_id = 0; + static thread_local unsigned int _engine_count; + static thread_local std::map _workspace_map; + static thread_local std::map> + _internal_resource_cache; + static thread_local detail::primitive_cache _primitive_cache; + ::dnnl::memory &get_workspace(void *key) { return _workspace_map[key]; } + void insert_workspace(void *key, ::dnnl::memory workspace) { + _workspace_map[key] = workspace; + } + const ::dnnl::stream &get_stream() const { return *_s; } + const ::dnnl::engine &get_engine() const { return *_eng; } + + void *allocate(const memory_desc_ext &desc, int count = 1); + void *allocate(size_t size); + std::shared_ptr get_internal_resource(sycl::queue *q){ + auto it = _internal_resource_cache.find(_q); + if (it == _internal_resource_cache.end()) { + return _internal_resource_cache[_q] = std::make_shared(); + } + return it->second; + } + void enter_primitive(size_t request_buffer_size = 0) { + auto &info = get_internal_resource(_q)->binfo; + if (info.primitive_depth == 0) { + info.usage = 0; + if (request_buffer_size > info.capacity) { + if (info.buffer && (info.capacity != 0)) { + auto ainfo = info; + ainfo.q.submit([=](sycl::handler &cgh) { + cgh.depends_on(ainfo.deps); + cgh.host_task([=] { sycl::free(ainfo.buffer, ainfo.q); }); + }); + } + size_t new_buffer_capacity = + std::max(request_buffer_size, info.capacity * 2); + info.capacity = new_buffer_capacity; + info.buffer = (uint8_t *)sycl::malloc_device(new_buffer_capacity, *_q); + info.q = *_q; + info.deps = sycl::event(); + } + } + info.primitive_depth++; + } + sycl::event exit_primitive(const sycl::event &e) { + auto &info = get_internal_resource(_q)->binfo; + info.primitive_depth--; + if ((info.primitive_depth == 0) && info.usage) { + info.deps = e; + } + return e; + } + ::dnnl::memory::desc + compress_spatial_dimensions_to_channel(const ::dnnl::memory::desc &desc); + ::dnnl::memory::desc + get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc, + batch_normalization_mode mode); + sycl::event batch_normalization_backward_internal( + batch_normalization_mode mode, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data, + const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param, + const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias, + float beta_param, void *diff_scale, void *diff_bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var); + sycl::event batch_normalization_forward_internal( + bool is_infer, batch_normalization_mode mode, float epsilon, float factor, + float alpha, const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var, + void *running_mean, void *running_var); + ::dnnl::memory::desc + transfer_memory_desc_to_channel_major_format(const ::dnnl::memory::desc &desc); + ::dnnl::memory::desc + bn_reorder_memory_to_channel_major_format( + bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache); + ::dnnl::memory::desc + transfer_memory_desc_to_format_tag_any(const ::dnnl::memory::desc &desc){ + return ::dnnl::memory::desc(desc.get_dims(), desc.get_data_type(), + ::dnnl::memory::format_tag::any); + } + void allocate_and_reorder_memory_to_optimal(::dnnl::memory::desc &from_desc, + void *&from, + ::dnnl::memory::desc &to_desc, + void *&to) { + if (from_desc != to_desc) { + to = allocate(to_desc); + async_reorder(1.f, from_desc, from, 0.f, to_desc, to); + } + } + template + std::pair + create_primitive_args_or_get(args_type &&...args); + template + typename primitive_type::primitive_desc + get_primitive_desc(::dnnl::primitive *p); + template + typename primitive_type::primitive_desc + create_primitive_desc(args_type &&...args); + template + void generate_cache_key(std::string &key_buffer, const T &arg); + template + void generate_cache_key(std::string &key_buffer, const T &first_arg, + const args_type &...args); + void insert_arg(std::unordered_map *args, int name, + const ::dnnl::memory::desc &desc, void *data) { + auto it = args->find(name); + if (it != args->end()) { + it->second.set_data_handle(data); + } else { + args->insert({name, ::dnnl::memory(desc, *_eng, data)}); + } + } + void insert_arg(std::unordered_map *args, int name, + const ::dnnl::memory &mem) { + (*args)[name] = mem; + } + sycl::event execute_rnn_forward_primitive( + rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction, + rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt, + ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c, + int dst_c, int layer_size, int direction_num, int hidden_size, + int gate_num, int projection_size, std::vector &data, + std::vector &offset, int iter_num, size_t *weight_size = nullptr, + size_t *workspace_size = nullptr, size_t *scratchpad_size = nullptr); + + sycl::event rnn_forward_internal( + const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter, + const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c, + size_t weight_size, void *weight, size_t workspace_size, void *workspace, + size_t scratchpad_size, void *scratchpad, bool is_get_execution_args, + size_t *weight_size_query, size_t *workspace_size_query, + size_t *scratchpad_size_query); + + sycl::event execute_rnn_backward_primitive( + rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode, + ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag, + int seq_length, int batch_size, int src_c, int dst_c, int layer_size, + int direction_num, int hidden_size, int gate_num, int projection_size, + std::vector &data, std::vector &offset, int iter_num); + bool + scale_parameter_preprocess(const std::vector &args); + template + sycl::event + execute_primitive(const std::pair &primitive, + const std::vector &extra_args = {}); + template + sycl::event fill_with_type(sycl::queue *q, void *src, const void *value, + size_t size_with_byte) { + return q->fill(static_cast(src), *static_cast(value), + size_with_byte / sizeof(T)); + } + template struct no_zero_op { + T operator()(T e) { + if (!e) { + return 1; + } + return e; + } + }; + template + void transform_no_zero_with_type(sycl::queue *q, void *src, void *dst, + size_t num) { + std::transform(oneapi::dpl::execution::make_device_policy(*q), + static_cast(src), static_cast(src) + num, + static_cast(dst), no_zero_op()); + } + void transform_no_zero(const memory_desc_ext &desc, void *src, void *dst); + ::dnnl::memory::desc get_group_weight_desc(int group_count, + const memory_desc_ext &weight_desc); + void get_rnn_configuration(const ::dnnl::memory::desc &desc, + rnn_direction direction, rnn_mode mode, + dpct::library_data_t dt, int hidden_size, + ::dnnl::memory::data_type *dnnl_dt, + ::dnnl::memory::format_tag *tag, + int *projection_size, int *output_size, + int *seq_length, int *batch_size, + int *direction_num, int *gate_num); +public: + engine_ext() {} + operator bool() const { + return bool(_eng) && bool(_s) && bool(_q); + } + engine_ext &operator=(std::nullptr_t) { + _eng = nullptr; + _s = nullptr; + _q = nullptr; + return *this; + } + /// Creating oneDNN engine. + void create_engine() { + _q = &dpct::get_current_device().default_queue(); + _eng = std::make_shared<::dnnl::engine>(::dnnl::sycl_interop::make_engine( + dpct::get_current_device(), dpct::get_current_device().get_context())); + _s = std::make_shared<::dnnl::stream>( + ::dnnl::sycl_interop::make_stream(*_eng, *_q)); + _engine_id = _engine_count++; + } + /// Setting the user's SYCL queue for an oneDNN engine. + /// \param [in] q Pointer to the SYCL queue. + void set_queue(sycl::queue *q) { + if (!q) { + throw std::runtime_error("set_queue: pointer must not be nullptr."); + } + if (!_eng) { + throw std::runtime_error("set_queue: current engine is invalid."); + } + if (q->get_context() != ::dnnl::sycl_interop::get_context(*_eng)) { + throw std::runtime_error( + "set_queue: queue is mismatch with current engine context."); + } + _q = q; + _s = std::make_shared<::dnnl::stream>( + ::dnnl::sycl_interop::make_stream(*_eng, *_q)); + } + /// Retrieving the user's SYCL queue set in the oneDNN engine. + /// \returns Pointer to the SYCL queue. + sycl::queue *get_queue() const { return _q; } + /// Setting all elements of a memory to a given value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] valuePtr Pointer to a single value. + void fill(const memory_desc_ext &src_desc, void *src, + const void *valuePtr); + /// Coping the scaled data from a memory to another memory with a different + /// description. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + void reorder(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst); + /// Scaling all the elements of a memory by a given factor. + /// \param [in] alpha Value to scaling factors. + /// \param [in] src_desc Source memory descriptor. + /// \param [out] src Pointer to source data. + void scale(float alpha, const memory_desc_ext &src_desc, void *src); + /// Adding the scaled values of a memory to another memory. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + void sum(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst); + /// Computing a specified activation function value. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + void activation_forward(activation_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst); + /// Computing the gradient of a specified activation function. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + void + activation_backward(activation_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src); + /// Computing a specified pooling function value. + /// \param [in] desc Pooling descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [out] workspace Pointer to workspace generated from forward propagation. + void pooling_forward(pooling_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace = nullptr); + /// Computing the gradient of a specified pooling function. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential + /// source data. + /// \param [in] workspace Pointer to workspace used for backward + /// propagation. + void pooling_backward(pooling_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src, + ::dnnl::memory *workspace = nullptr); + /// Computing a specified softmax function value. + /// \param [in] alg Softmax algorithm. + /// \param [in] mode Softmax mode. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + void softmax_forward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, void *dst); + /// Computing the gradient of a specified softmax function. + /// \param [in] alg Softmax algorithm. + /// \param [in] mode Softmax mode. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + void softmax_backward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, + void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src); + /// Computing a specified local response normalization function value. + /// \param [in] desc Local response normalization descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [out] workspace Pointer to workspace generated from forward + /// propagation. + void lrn_forward(lrn_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace = nullptr); + /// Computing the gradient of a specified local response normalization + /// function. + /// \param [in] desc Local response normalization descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] workspace Pointer to workspace used for backward propagation. + void lrn_backward(lrn_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &diff_src_desc, + void *diff_src, ::dnnl::memory *workspace = nullptr); + /// Setting all elements of a memory to a given value asynchronously. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] valuePtr Pointer to a single value. + /// \returns An event representing the fill operations. + sycl::event async_fill(const memory_desc_ext &src_desc, void *src, + const void *valuePtr); + /// Coping the scaled data from a memory to another memory with a different + /// description asynchronously. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the reorder operations. + sycl::event async_reorder(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst); + /// Scaling all the elements of a memory by a given factor asynchronously. + /// \param [in] alpha Value to scaling factors. + /// \param [in] src_desc Source memory descriptor. + /// \param [out] src Pointer to source data. + /// \returns An event representing the scale operations. + sycl::event async_scale(float alpha, const memory_desc_ext &src_desc, void *src); + /// Adding the scaled values of a memory to another memory asynchronously. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the sum operations. + sycl::event async_sum(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst); + + /// Perform specified binary operation asynchronously. + /// \param [in] op Specified binary operation. + /// \param [in] alpha_0 Value to scaling factors used to scale the src_0 + /// value. + /// \param [in] src_desc_0 Source 0 memory descriptor. + /// \param [in] src_0 Pointer to source 0 data. + /// \param [in] alpha_1 Value to scaling factors used to scale the src_1 + /// value. + /// \param [in] src_desc_1 Source 1 memory descriptor. + /// \param [in] src_1 Pointer to source 1 data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the binary operations. + sycl::event async_binary(binary_op op, float alpha_0, + const memory_desc_ext &src_desc_0, void *src_0, + float alpha_1, const memory_desc_ext &src_desc_1, + void *src_1, float beta, const memory_desc_ext &dst_desc, + void *dst); + + /// Perform specified binary operation asynchronously. + /// \param [in] op Specified reduction operation. + /// \param [in] alpha Value to scaling factors used to scale the data + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the reduction operations. + sycl::event async_reduction(reduction_op op, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst); + /// Computing a specified activation function value asynchronously. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the activation forward operations. + sycl::event async_activation_forward(activation_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst); + /// Computing the gradient of a specified activation function asynchronously. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \returns An event representing the activation backward operations. + sycl::event + async_activation_backward(activation_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src); + /// Computing a specified pooling function value asynchronously. + /// \param [in] desc Pooling descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [out] workspace Pointer to workspace generated from forward propagation. + /// \returns An event representing the pooling forward operations. + sycl::event async_pooling_forward(pooling_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace = nullptr); + /// Computing the gradient of a specified pooling function asynchronously. + /// \param [in] desc Activation descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential + /// source data. + /// \param [in] workspace Pointer to workspace used for backward + /// propagation. + /// \returns An event representing the pooling backward operations. + sycl::event async_pooling_backward(pooling_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src, + ::dnnl::memory *workspace = nullptr); + /// Computing a specified softmax function value asynchronously. + /// \param [in] alg Softmax algorithm. + /// \param [in] mode Softmax mode. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the softmax forward operations. + sycl::event async_softmax_forward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, void *dst); + /// Computing the gradient of a specified softmax function asynchronously. + /// \param [in] alg Softmax algorithm. + /// \param [in] mode Softmax mode. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \returns An event representing the softmax backward operations. + sycl::event async_softmax_backward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, + void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src); + /// Computing a specified local response normalization function value + /// asynchronously. + /// \param [in] desc Local response normalization descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [out] workspace Pointer to workspace generated from forward + /// propagation. + /// \returns An event representing the lrn forward operations. + sycl::event async_lrn_forward(lrn_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace = nullptr); + /// Computing the gradient of a specified local response normalization + /// function asynchronously. + /// \param [in] desc Local response normalization descriptor. + /// \param [in] alpha Value to scaling factors used to scale the computed value. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the differential destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] workspace Pointer to workspace used for backward propagation. + /// \returns An event representing the lrn backward operations. + sycl::event async_lrn_backward(lrn_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &diff_src_desc, + void *diff_src, ::dnnl::memory *workspace = nullptr); + + /// Derives a memory descriptor for the batch normalization scale, bias, mean, + /// variance from the source memory descriptor and batch normalization mode. + /// \param [out] desc Derived memory descriptor. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] mode Batch normalization mode. + static void derive_batch_normalization_memory_desc(memory_desc_ext &desc, + const memory_desc_ext &src_desc, + batch_normalization_mode mode); + + /// Derives a memory descriptor for the batch normalization scale, bias, mean, + /// variance from the source memory descriptor and batch normalization mode. + /// \param [out] scale_bias_desc Derived scale and bias memory descriptor. + /// \param [out] mean_var_desc Derived mean and var memory descriptor. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] mode Batch normalization mode. + static void derive_batch_normalization_memory_desc(memory_desc_ext &scale_bias_desc, + memory_desc_ext &mean_var_desc, + const memory_desc_ext &src_desc, + batch_normalization_mode mode); + + /// Get the size of workspace that needed by batch normalization. The data stored + /// in workspace must be preserved between forward and backward. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] src_desc Source memory descriptor. + /// \returns Size of workspace. + size_t get_batch_normalization_workspace_size( + batch_normalization_ops ops, const memory_desc_ext &src_desc); + + /// Computing a specified batch normalization inference stage function value + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory + /// descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [in] mean Pointer to mean data. + /// \param [in] var Pointer to variance data. + /// \returns An event representing the batch normalization forward operations. + sycl::event async_batch_normalization_forward_inference( + batch_normalization_mode mode, float epsilon, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *mean, void *var); + + /// Computing a specified batch normalization inference stage function value + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] summand_desc Summand memory descriptor. + /// \param [in] summand Pointer to summand data. + /// \param [in] scale_bias_desc Scale, bias memory descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [in] mean_var_desc Mean, variance memory descriptor. + /// \param [in] mean Pointer to mean data. + /// \param [in] var Pointer to variance data. + /// \returns An event representing the batch normalization forward operations. + sycl::event async_batch_normalization_forward_inference( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *mean, void *var); + + /// Computing a specified batch normalization training stage function value + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] factor Factor value used in running mean and variance + /// computation. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory + /// descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [out] running_mean Pointer to running mean data. + /// \param [out] running_var Pointer to running variance data. + /// \param [out] saved_mean Pointer to optional cache to save mean data. + /// \param [out] saved_var Pointer to optional cache to save variance data. + /// \returns An event representing the batch normalization forward operations. + sycl::event async_batch_normalization_forward_training( + batch_normalization_mode mode, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *running_mean, void *running_var, void *saved_mean, void *saved_var); + + /// Computing a specified batch normalization training stage function value + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] factor Factor value used in running mean and variance + /// computation. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] summand_desc Summand memory descriptor. + /// \param [in] summand Pointer to summand data. + /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory + /// descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [out] running_mean Pointer to running mean data. + /// \param [out] running_var Pointer to running variance data. + /// \param [out] saved_mean Pointer to optional cache to save mean data. + /// \param [out] saved_var Pointer to optional cache to save variance data. + /// \param [in] workspace_size Size of workspace. + /// \param [out] workspace Pointer to workspace generated from forward + /// propagation. + /// \returns An event representing the batch normalization forward operations. + sycl::event async_batch_normalization_forward_training( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *running_mean, void *running_var, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace); + + /// Computing a specified batch normalization training stage function value + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] factor Factor value used in running mean and variance + /// computation. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] summand_desc Summand memory descriptor. + /// \param [in] summand Pointer to summand data. + /// \param [in] scale_bias_desc Scale, bias memory descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [in] mean_var_desc Mean, variance memory descriptor. + /// \param [out] running_mean Pointer to running mean data. + /// \param [out] running_var Pointer to running variance data. + /// \param [out] saved_mean Pointer to optional cache to save mean data. + /// \param [out] saved_var Pointer to optional cache to save variance data. + /// \param [in] workspace_size Size of workspace. + /// \param [out] workspace Pointer to workspace generated from forward + /// propagation. + /// \returns An event representing the batch normalization forward operations. + sycl::event async_batch_normalization_forward_training( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var, + void *saved_mean, void *saved_var, size_t workspace_size, void *workspace); + + /// Computing the gradient of a specified batch normalization function asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] alpha_data Value to scaling factors used to scale the computed + /// data value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta_data Value to scaling factors used to scale the prior value + /// in the data memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] alpha_param Value to scaling factors used to scale the computed + /// parameter value. + /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean, + /// variance memory descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] beta_param Value to scaling factors used to scale the prior value + /// in the parameter memory. + /// \param [in] diff_scale Pointer to differential scale data. + /// \param [in] diff_bias Pointer to differential bias data. + /// \param [in] saved_mean Pointer to optional cache saved mean data in forward. + /// \param [in] saved_var Pointer to optional cache saved variance data in forward. + /// \returns An event representing the batch normalization backward operations. + sycl::event async_batch_normalization_backward( + batch_normalization_mode mode, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data, + const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param, + const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale, + float beta_param, void *diff_scale, void *diff_bias, void *saved_mean, + void *saved_var); + + /// Computing the gradient of a specified batch normalization function + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] alpha_data Value to scaling factors used to scale the computed + /// data value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta_data Value to scaling factors used to scale the prior value + /// in the data memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] diff_summand_desc Differential summand memory descriptor. + /// \param [out] diff_summand Pointer to differential summand data. + /// \param [in] alpha_param Value to scaling factors used to scale the computed + /// parameter value. + /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean, + /// variance memory descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [in] beta_param Value to scaling factors used to scale the prior value + /// in the parameter memory. + /// \param [out] diff_scale Pointer to differential scale data. + /// \param [out] diff_bias Pointer to differential bias data. + /// \param [in] saved_mean Pointer to optional cache saved mean data in forward. + /// \param [in] saved_var Pointer to optional cache saved variance data in forward. + /// \param [in] workspace_size Size of workspace. + /// \param [in] workspace Pointer to workspace used for backward propagation. + /// \returns An event representing the batch normalization backward operations. + sycl::event async_batch_normalization_backward( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data, + const memory_desc_ext &diff_src_desc, void *diff_src, + const memory_desc_ext &diff_summand_desc, void *diff_summand, + float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc, + void *scale, void *bias, float beta_param, void *diff_scale, + void *diff_bias, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace); + + /// Computing the gradient of a specified batch normalization function + /// asynchronously. + /// \param [in] mode Batch normalization mode. + /// \param [in] ops Batch normalization operation mode. This mode can set to + /// perform only batch normalization, or batch normalization followed by + /// activation, or batch normalization followed by element-wise addition and + /// activation. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] epsilon Epsilon value used in computation. + /// \param [in] alpha_data Value to scaling factors used to scale the computed + /// data value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta_data Value to scaling factors used to scale the prior value + /// in the data memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] diff_summand_desc Differential summand memory descriptor. + /// \param [out] diff_summand Pointer to differential summand data. + /// \param [in] alpha_param Value to scaling factors used to scale the computed + /// parameter value. + /// \param [in] diff_scale_bias_desc Differential scale, bias memory descriptor. + /// \param [in] scale Pointer to scale data. + /// \param [in] bias Pointer to bias data. + /// \param [in] beta_param Value to scaling factors used to scale the prior value + /// in the parameter memory. + /// \param [out] diff_scale Pointer to differential scale data. + /// \param [out] diff_bias Pointer to differential bias data. + /// \param [in] mean_var_desc Differential mean, variance memory descriptor. + /// \param [in] saved_mean Pointer to optional cache saved mean data in forward. + /// \param [in] saved_var Pointer to optional cache saved variance data in forward. + /// \param [in] workspace_size Size of workspace. + /// \param [in] workspace Pointer to workspace used for backward propagation. + /// \returns An event representing the batch normalization backward operations. + sycl::event async_batch_normalization_backward( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst, + float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src, + const memory_desc_ext &diff_summand_desc, void *diff_summand, + float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale, + void *bias, float beta_param, void *diff_scale, void *diff_bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace); + + /// Computing a specified convolution function value asynchronously. + /// \param [in] desc Convolution descriptor. + /// \param [in] alg Convolution algorithm. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] weight_desc Weight memory descriptor. + /// \param [in] weight Pointer to weight data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the convolution forward operations. + sycl::event async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg, + float alpha, const memory_desc_ext &src_desc, + void *src, const memory_desc_ext &weight_desc, + void *weight, float beta, + const memory_desc_ext &dst_desc, void *dst); + + /// Computing a specified convolution function value asynchronously. + /// \param [in] desc Convolution descriptor. + /// \param [in] alg Convolution algorithm. + /// \param [in] adesc Activation operation descriptor. + /// \param [in] alpha_0 Value to scaling factors used to scale the data + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] weight_desc Weight memory descriptor. + /// \param [in] weight Pointer to weight data. + /// \param [in] alpha_1 Value to scaling factors used to scale the summand + /// value. + /// \param [in] summand_desc Summand memory descriptor. + /// \param [in] summand Pointer to summand data. + /// \param [in] bias_desc Bias memory descriptor. + /// \param [in] bias Pointer to bias data. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \returns An event representing the convolution forward operations. + sycl::event async_convolution_forward( + convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc, + float alpha_0, const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &weight_desc, void *weight, float alpha_1, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &bias_desc, void *bias, + const memory_desc_ext &dst_desc, void *dst); + + /// Computing the data gradient of a specified convolution function asynchronously. + /// \param [in] desc Convolution descriptor. + /// \param [in] alg Convolution algorithm. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] weight_desc Weight memory descriptor. + /// \param [in] weight Pointer to weight data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \returns An event representing the convolution backward data operations. + sycl::event async_convolution_backward_data( + convolution_desc &desc, ::dnnl::algorithm alg, float alpha, + const memory_desc_ext &weight_desc, void *weight, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src); + + /// Computing the weight gradient of a specified convolution function + /// asynchronously. + /// \param [in] desc Convolution descriptor. + /// \param [in] alg Convolution algorithm. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] diff_weight_desc Differential weight memory descriptor. + /// \param [out] diff_weight Pointer to differential weight data. + /// \returns An event representing the convolution backward weight operations. + sycl::event async_convolution_backward_weight( + convolution_desc &desc, ::dnnl::algorithm alg, float alpha, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta, + const memory_desc_ext &diff_weight_desc, void *diff_weight); + + /// Computing the bias gradient of a specified convolution function + /// asynchronously. + /// \param [in] alpha Value to scaling factors used to scale the computed + /// value. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] beta Value to scaling factors used to scale the prior value + /// in the destination memory. + /// \param [in] diff_bias_desc Differential bias memory descriptor. + /// \param [out] diff_bias Pointer to differential bias data. + /// \returns An event representing the convolution backward bias operations. + sycl::event async_convolution_backward_bias(float alpha, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, float beta, + const memory_desc_ext &diff_bias_desc, + void *diff_bias); + + /// Getting the required weight space size for specified rnn operation. + /// \param [in] desc RNN descriptor. + /// \param [out] weight_space_size Size of required weight space. + void rnn_get_weight_space_size(const rnn_desc &desc, + size_t *weight_space_size); + + /// Getting the required scratchpad size and workspace size for specified rnn operation. + /// \param [in] desc RNN descriptor. + /// \param [in] kind Propagation kind. + /// \param [in] src_desc Source memory descriptor. + /// \param [out] scratchpad_size Size of required scratchpad. + /// \param [out] workspace_size Size of required workspace. + void rnn_get_scratchpad_workspace_size(const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, + size_t *scratchpad_size, size_t *workspace_size); + + /// Computing a specified rnn function value asynchronously. + /// \param [in] desc RNN descriptor. + /// \param [in] kind Propagation kind. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] iter_desc Recurrent hidden state data memory descriptor. + /// \param [in] src_iter Pointer to input recurrent hidden state data. + /// \param [in] dst_iter Pointer to output recurrent hidden state data. + /// \param [in] iter_c_desc Recurrent cell state data memory descriptor. + /// \param [in] src_c_iter Pointer to input recurrent cell state data. + /// \param [in] dst_c_iter Pointer to output recurrent cell state data. + /// \param [in] weight_size Size of weight memory. + /// \param [in] weight Pointer to weight data. + /// \param [in] scratchpad_size Size of scratchpad memory. + /// \param [in] scratchpad Pointer to scratchpad data. + /// \param [in] workspace_size Size of workspace memory. + /// \param [in] workspace Pointer to workspace data. + /// \returns An event representing the status of rnn forward operations. + sycl::event async_rnn_forward(const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &iter_desc, void *src_iter, + void *dst_iter, + const memory_desc_ext &iter_c_desc, + void *src_iter_c, void *dst_iter_c, + size_t weight_size, void *weight, + size_t scratchpad_size, void *scratchpad, + size_t workspace_size, void *workspace); + + /// Computing the data and weight gradient of a specified rnn function + /// asynchronously. + /// \param [in] desc RNN descriptor. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [in] dst Pointer to destination data. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] iter_desc Recurrent hidden state data memory descriptor. + /// \param [in] src_iter Pointer to input recurrent hidden state data. + /// \param [in] diff_dst_iter Pointer to differential output recurrent hidden state data. + /// \param [out] diff_src_iter Pointer to differential input recurrent hidden state data. + /// \param [in] iter_c_desc Recurrent cell state data memory descriptor. + /// \param [in] src_c_iter Pointer to input recurrent cell state data. + /// \param [in] diff_dst_c_iter Pointer to differential output recurrent cell state data. + /// \param [out] diff_src_c_iter Pointer to differential input recurrent cell state data. + /// \param [in] weight_size Size of weight memory. + /// \param [in] weight Pointer to weight data. + /// \param [out] diff_weight Pointer to differential weight data. + /// \param [in] scratchpad_size Size of scratchpad memory. + /// \param [in] scratchpad Pointer to scratchpad data. + /// \param [in] workspace_size Size of workspace memory. + /// \param [in] workspace Pointer to workspace data. + /// \returns An event representing the status of rnn backward operations. + sycl::event async_rnn_backward( + const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst, + void *diff_dst, const memory_desc_ext &src_desc, void *src, + void *diff_src, const memory_desc_ext &iter_desc, void *src_iter, + void *diff_dst_iter, void *diff_src_iter, + const memory_desc_ext &iter_c_desc, void *src_iter_c, + void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size, + void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad, + size_t workspace_size, void *workspace); + + /// Getting the required state size for specified dropout operation. + /// \param [in] src_desc Source memory descriptor. + /// \returns Required size of state. + size_t get_dropout_state_size(); + + /// Getting the required workspace size for dropout operation. + /// \param [in] src_desc Source memory descriptor. + /// \returns Required size of workspace. + static size_t get_dropout_workspace_size(const memory_desc_ext &src_desc); + + /// Computing a specified dropout function value asynchronously. + /// \param [in] desc Dropout descriptor. + /// \param [in] src_desc Source memory descriptor. + /// \param [in] src Pointer to source data. + /// \param [in] dst_desc Destination memory descriptor. + /// \param [out] dst Pointer to destination data. + /// \param [in] workspace Pointer to workspace data. + /// \param [in] workspace_size Size of workspace memory. + /// \returns An event representing the dropout forward operations. + sycl::event async_dropout_forward(dropout_desc &desc, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &dst_desc, void *dst, + void *workspace, size_t workspace_size); + + /// Computing the gradient of a specified dropout function asynchronously. + /// \param [in] desc Dropout descriptor. + /// \param [in] diff_dst_desc Differential destination memory descriptor. + /// \param [in] diff_dst Pointer to differential destination data. + /// \param [in] diff_src_desc Differential source memory descriptor. + /// \param [out] diff_src Pointer to differential source data. + /// \param [in] workspace Pointer to workspace data. + /// \param [in] workspace_size Size of workspace memory. + /// \returns An event representing the dropout backward operations. + sycl::event async_dropout_backward(dropout_desc &desc, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, + const memory_desc_ext &diff_src_desc, + void *diff_src, void *workspace, + size_t workspace_size); +}; + +inline thread_local unsigned int engine_ext::_engine_count; +inline thread_local detail::primitive_cache engine_ext::_primitive_cache; +inline thread_local std::map engine_ext::_workspace_map; +inline thread_local std::map> + engine_ext::_internal_resource_cache; + +inline +void dropout_desc::restore(engine_ext &engine, float p, void *state, + size_t state_size, unsigned long long seed) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) " + "Interfaces Project does not support this API."); +#else + if (state) { + std::int64_t required_state_size = engine.get_dropout_state_size(); + if (state_size < required_state_size) { + throw std::runtime_error("restore: state_size less than required state size."); + } + sycl::queue *q = engine.get_queue(); + _imp->_p = p; + _imp->_seed = seed; + _imp->_state = state; + _imp->_host_state = std::vector(required_state_size); + q->memcpy(_imp->_host_state.data(), _imp->_state, required_state_size).wait(); + _imp->_rng_engine = + oneapi::mkl::rng::load_state( + *q, _imp->_host_state.data()); + } +#endif +} + +inline +void dropout_desc::set(engine_ext &engine, float p, void *state, + size_t state_size, unsigned long long seed) { +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) " + "Interfaces Project does not support this API."); +#else + _imp->_p = p; + if (state) { + std::int64_t required_state_size = engine.get_dropout_state_size(); + if (state_size < required_state_size) { + throw std::runtime_error("set: no sufficient memory to save states."); + } + sycl::queue *q = engine.get_queue(); + _imp->_seed = seed; + _imp->_state = state; + _imp->_host_state = std::vector(required_state_size); + _imp->_rng_engine = rng_engine_t(*q, seed); + oneapi::mkl::rng::save_state(_imp->_rng_engine, _imp->_host_state.data()); + q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size).wait(); + } +#endif +} + +inline +::dnnl::memory::data_type +memory_desc_ext::to_dnnl_data_type(dpct::library_data_t dt) { + using dnnl_dt = ::dnnl::memory::data_type; + switch (dt) { + case dpct::library_data_t::real_half: + return dnnl_dt::f16; + case dpct::library_data_t::real_bfloat16: + return dnnl_dt::bf16; + case dpct::library_data_t::real_float: + return dnnl_dt::f32; + case dpct::library_data_t::real_double: + return dnnl_dt::f64; + case dpct::library_data_t::real_int32: + return dnnl_dt::s32; + case dpct::library_data_t::real_int8: + return dnnl_dt::s8; + case dpct::library_data_t::real_uint8: + return dnnl_dt::u8; + case dpct::library_data_t::real_int8_4: + return dnnl_dt::s8; + case dpct::library_data_t::real_int8_32: + return dnnl_dt::s8; + case dpct::library_data_t::real_uint8_4: + return dnnl_dt::u8; + default: + throw std::runtime_error("to_dnnl_data_type: unsupported data type."); + } +} + +inline +dpct::library_data_t +memory_desc_ext::to_dpct_library_data_t(::dnnl::memory::data_type dt, + unsigned block_size) { + using dpct_dt = dpct::library_data_t; + using dnnl_dt = ::dnnl::memory::data_type; + switch (dt) { + case dnnl_dt::f16: + return dpct_dt::real_half; + case dnnl_dt::bf16: + return dpct_dt::real_bfloat16; + case dnnl_dt::f32: + return dpct_dt::real_float; + case dnnl_dt::f64: + return dpct_dt::real_double; + case dnnl_dt::s32: + return dpct_dt::real_int32; + case dnnl_dt::s8: + if (block_size == 4) { + return dpct_dt::real_int8_4; + } else if (block_size == 32) { + return dpct_dt::real_int8_32; + } else { + return dpct_dt::real_int8; + } + case dnnl_dt::u8: + if (block_size == 4) { + return dpct_dt::real_uint8_4; + } else { + return dpct_dt::real_uint8; + } + default: + throw std::runtime_error("to_dpct_library_data_t: unsupported data type " + "dnnl::memory::data_type::undef."); + } +} + +inline +::dnnl::memory::format_tag +memory_desc_ext::to_dnnl_format_tag(dpct::library_data_t dt, + memory_format_tag tag) { + using dpct_dt = dpct::library_data_t; + using dpct_tag = memory_format_tag; + using dnnl_tag = ::dnnl::memory::format_tag; + switch (tag) { + case dpct_tag::nchw: + return dnnl_tag::nchw; + case dpct_tag::nhwc: + return dnnl_tag::nhwc; + default: + if (dt == dpct_dt::real_int8_32) { + return dnnl_tag::nChw32c; + } else { + return dnnl_tag::nChw4c; + } + } +} + +inline +void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt, int n, + int c, int h, int w) { + _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt), + to_dnnl_format_tag(dt, tag)); +} + +inline +void memory_desc_ext::set(dpct::library_data_t dt, int n, int c, int h, int w, + int n_stride, int c_stride, int h_stride, + int w_stride) { + _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt), + {n_stride, c_stride, h_stride, w_stride}); +} + +inline +void memory_desc_ext::set(dpct::library_data_t dt, int ndims, const int dims[], + const int strides[]) { + _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt), + {strides, strides + ndims}); +} + +inline +void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt, + int ndims, const int dims[]) { + _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt), + to_dnnl_format_tag(dt, tag)); +} + +inline +void memory_desc_ext::set(rnn_memory_format_tag tag, dpct::library_data_t dt, + int t, int n, int c) { + if (tag == rnn_memory_format_tag::tnc) { + _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt), + ::dnnl::memory::format_tag::tnc); + } else if(tag == rnn_memory_format_tag::ntc) { + _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt), + ::dnnl::memory::format_tag::ntc); + } else { + throw std::runtime_error("set: unsupported memory format tag."); + } +} + +inline +void memory_desc_ext::get(dpct::library_data_t *dt, int *n, int *c, int *h, + int *w, int *n_stride, int *c_stride, int *h_stride, + int *w_stride) const { + unsigned block_size = 1; + auto dims = _desc.get_dims(); + auto inner_blks = _desc.get_inner_blks(); + auto strides = _desc.get_strides(); + if (!inner_blks.empty()) { + block_size = inner_blks[0]; + } + + *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size); + *n = dims[0]; + *c = dims[1]; + *h = dims[2]; + *w = dims[3]; + *n_stride = strides[0] / block_size; + *c_stride = strides[1] / block_size; + *h_stride = strides[2] / block_size; + *w_stride = strides[3] / block_size; +} + +inline +void memory_desc_ext::get(dpct::library_data_t *dt, memory_format_tag *tag, + int *n, int *c, int *h, int *w) const { + unsigned block_size = 1; + *tag = memory_format_tag::nchw; + auto dims = _desc.get_dims(); + auto strides = _desc.get_strides(); + auto inner_blks = _desc.get_inner_blks(); + if (!inner_blks.empty()) { + block_size = inner_blks[0]; + *tag = memory_format_tag::nchw_blocked; + } + if (strides[1] == 1 && dims[1] != 1) { + *tag = memory_format_tag::nhwc; + } + *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size); + *n = dims[0]; + *c = dims[1]; + *h = dims[2]; + *w = dims[3]; +} + +inline +void memory_desc_ext::get(dpct::library_data_t *dt, rnn_memory_format_tag *tag, + int *t, int *n, int *c) const { + auto dims = _desc.get_dims(); + auto strides = _desc.get_strides(); + + if (strides[0] >= strides[1]) { + *tag = rnn_memory_format_tag::tnc; + } else { + *tag = rnn_memory_format_tag::ntc; + } + + *dt = to_dpct_library_data_t(_desc.get_data_type(), 1); + *t = dims[0]; + *n = dims[1]; + *c = dims[2]; +} + +inline +void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt, + int *ndims, int dims[], int strides[]) const { + unsigned block_size = 1; + auto inner_blks = _desc.get_inner_blks(); + auto adims = _desc.get_dims(); + auto astrides = _desc.get_strides(); + if (!inner_blks.empty()) { + block_size = inner_blks[0]; + } + *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size); + *ndims = _desc.get_ndims(); + for (int index = 0; index < requested_ndims; index++) { + dims[index] = adims[index]; + strides[index] = + astrides[index] / block_size; + } +} + +inline +void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt, + memory_format_tag *tag, int *ndims, + int dims[]) const { + unsigned block_size = 1; + *tag = memory_format_tag::nchw; + auto inner_blks = _desc.get_inner_blks(); + auto adims = _desc.get_dims(); + auto astrides = _desc.get_strides(); + if (!inner_blks.empty()) { + block_size = inner_blks[0]; + *tag = memory_format_tag::nchw_blocked; + } + if (astrides[1] == 1 && + adims[1] != 1) { + *tag = memory_format_tag::nhwc; + } + *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size); + *ndims = _desc.get_ndims(); + for (int index = 0; index < requested_ndims; index++) { + dims[index] = adims[index]; + } +} + +inline +void engine_ext::get_rnn_configuration(const ::dnnl::memory::desc &desc, + rnn_direction direction, rnn_mode mode, + dpct::library_data_t dt, int hidden_size, + ::dnnl::memory::data_type *dnnl_dt, + ::dnnl::memory::format_tag *tag, + int *projection_size, int *output_size, + int *seq_length, int *batch_size, + int *direction_num, int *gate_num) { + if (!desc.is_zero()) { + auto dims = desc.get_dims(); + auto strides = desc.get_strides(); + if (strides[0] >= strides[1]) { + *tag = ::dnnl::memory::format_tag::tnc; + *seq_length = dims[0]; + *batch_size = dims[1]; + } else { + *tag = ::dnnl::memory::format_tag::ntc; + *seq_length = dims[1]; + *batch_size = dims[0]; + } + } + if (direction == rnn_direction::bidirectional) { + *direction_num = 2; + } else { + *direction_num = 1; + } + if (mode == rnn_mode::lstm) { + *gate_num = 4; + } else if (mode == rnn_mode::gru) { + *gate_num = 3; + } else { + *gate_num = 1; + } + if (*projection_size != hidden_size) { + *output_size = *projection_size; + } else { + *projection_size = 0; + *output_size = hidden_size; + } + *dnnl_dt = memory_desc_ext::to_dnnl_data_type(dt); +} + +inline +void *engine_ext::allocate(const memory_desc_ext &data_desc, int count) { + return allocate(data_desc.get_size() * count); +} + +inline +void *engine_ext::allocate(size_t size) { + auto &Info = get_internal_resource(_q)->binfo; + uint8_t *result = Info.buffer + Info.usage; + Info.usage += size; + return result; +} + +inline +void engine_ext::transform_no_zero(const memory_desc_ext &desc, void *src, void *dst) { + ::dnnl::memory::data_type dt = desc.get_desc().get_data_type(); + size_t element_num = desc.get_element_num(); + switch (dt) { + case ::dnnl::memory::data_type::f32: + transform_no_zero_with_type(_q, src, dst, element_num); + break; + case ::dnnl::memory::data_type::f16: + transform_no_zero_with_type(_q, src, dst, element_num); + break; + case ::dnnl::memory::data_type::s32: + transform_no_zero_with_type(_q, src, dst, element_num); + break; + case ::dnnl::memory::data_type::s8: + transform_no_zero_with_type(_q, src, dst, element_num); + break; + case ::dnnl::memory::data_type::u8: + transform_no_zero_with_type(_q, src, dst, element_num); + break; + default: + throw std::runtime_error("transform_no_zero: unsupported data type."); + } +} + +inline +::dnnl::memory::desc +engine_ext::get_group_weight_desc(int group_count, + const memory_desc_ext &weight_desc) { + if (group_count == 1) { + return weight_desc.get_desc(); + } + auto help_weight_desc = weight_desc.get_desc(); + int ndims = help_weight_desc.get_ndims(); + if (!help_weight_desc.get_inner_blks().empty()) { + throw std::runtime_error("get_group_weight_desc: group convolution with " + "blocked weight memory unimplemented."); + } + std::vector new_size; + auto old_size = weight_desc.get_dims(); + new_size.push_back(group_count); + new_size.push_back(old_size[0] / group_count); + for (int index = 1; index < old_size.size(); index++) { + new_size.push_back(old_size[index]); + } + std::vector strides = help_weight_desc.get_strides(); + ::dnnl::memory::format_tag tag; + bool is_nhwc = (strides[1] == 1 && old_size[1] != 1); + + if (ndims == 4) { + if (is_nhwc) { + tag = ::dnnl::memory::format_tag::gohwi; + } else { + tag = ::dnnl::memory::format_tag::goihw; + } + } else if (ndims == 5) { + if (is_nhwc) { + tag = ::dnnl::memory::format_tag::godhwi; + } else { + tag = ::dnnl::memory::format_tag::goidhw; + } + } + + help_weight_desc = + ::dnnl::memory::desc(new_size, weight_desc.get_desc().get_data_type(), tag); + return help_weight_desc; +} + +inline +::dnnl::memory::desc engine_ext::compress_spatial_dimensions_to_channel( + const ::dnnl::memory::desc &desc) { + int ndims = desc.get_ndims(); + auto dims = desc.get_dims(); + auto inner_blks = desc.get_inner_blks(); + assert(ndims >= 4 && "ndims is at least 4."); + std::vector compressed_dims(ndims); + compressed_dims[0] = dims[0]; + compressed_dims[1] = dims[1]; + for (int index = 2; index < ndims; index++) { + compressed_dims[1] = compressed_dims[1] * dims[index]; + compressed_dims[index] = 1; + } + if (!inner_blks.empty() && inner_blks[0] == 4) { + return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(), + ::dnnl::memory::format_tag::nChw4c); + } else if (!inner_blks.empty() && inner_blks[0] == 32) { + return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(), + ::dnnl::memory::format_tag::nChw32c); + } + std::vector strides(ndims, 1); + strides[0] = compressed_dims[1]; + + return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(), strides); +} + +inline +::dnnl::memory::desc +engine_ext::get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc, + batch_normalization_mode mode) { + int ndims = desc.get_ndims(); + auto dims = desc.get_dims(); + assert(ndims >= 4 && "ndims is at least 4."); + int channel_num = 1; + if (mode == batch_normalization_mode::spatial) { + channel_num = dims[1]; + } else { + for (int index = 1; index < ndims; index++) { + channel_num = channel_num * dims[index]; + } + } + return ::dnnl::memory::desc({channel_num}, desc.get_data_type(), + ::dnnl::memory::format_tag::a); +} + +inline +::dnnl::memory::desc engine_ext::transfer_memory_desc_to_channel_major_format( + const ::dnnl::memory::desc &desc) { + if (!desc.get_inner_blks().empty()) { + return desc; + } + int ndims = desc.get_ndims(); + auto dims = desc.get_dims(); + if (ndims == 4) { + return ::dnnl::memory::desc(dims, desc.get_data_type(), + ::dnnl::memory::format_tag::nchw); + } + return ::dnnl::memory::desc(dims, desc.get_data_type(), + ::dnnl::memory::format_tag::ncdhw); +} + +/// If the alpha = 0 and beta = 1, then the destination (dst = alpha * out + +/// beta * prior_dst) have no change. In this case this function returns true +/// means the operation can exit directly. +inline +bool engine_ext::scale_parameter_preprocess( + const std::vector &args) { + bool direct_exit = true; + for (auto &arg : args) { + if (arg._alpha == 0.f) { + if (arg._beta != 1.f) { + async_scale(arg._beta, arg._desc, arg._data); + } + } else { + direct_exit = false; + } + } + return direct_exit; +} + +inline +void engine_ext::derive_batch_normalization_memory_desc( + memory_desc_ext &scale_bias_desc, memory_desc_ext &mean_var_desc, + const memory_desc_ext &src_desc, batch_normalization_mode mode) { + derive_batch_normalization_memory_desc(scale_bias_desc, src_desc, mode); + derive_batch_normalization_memory_desc(mean_var_desc, src_desc, mode); +} + +inline +void engine_ext::derive_batch_normalization_memory_desc( + memory_desc_ext &desc, const memory_desc_ext &src_desc, + batch_normalization_mode mode) { + int src_ndims = src_desc.get_desc().get_ndims(); + auto inner_blks = src_desc.get_desc().get_inner_blks(); + if (src_desc.get_desc().get_ndims() != 4 || + src_desc.get_desc().get_ndims() != 5) { + throw std::runtime_error("derive_batch_normalization_memory_desc: only 4d " + "and 5d memory descriptor supported."); + } + std::vector dims = src_desc.get_dims(); + dims[0] = 1; + if (mode == batch_normalization_mode::spatial) { + dims[2] = 1; + dims[3] = 1; + if (src_ndims == 5) { + dims[4] = 1; + } + } + auto data_type = src_desc.get_desc().get_data_type(); + if (data_type == ::dnnl::memory::data_type::f16) { + data_type = ::dnnl::memory::data_type::f32; + } + if (!inner_blks.empty() && inner_blks[0] == 4) { + desc.set_desc(::dnnl::memory::desc(dims, data_type, + ::dnnl::memory::format_tag::nChw4c)); + } else if (!inner_blks.empty() && inner_blks[0] == 32) { + desc.set_desc(::dnnl::memory::desc(dims, data_type, + ::dnnl::memory::format_tag::nChw32c)); + } else { + if (src_ndims == 4) { + desc.set_desc(::dnnl::memory::desc(dims, data_type, + ::dnnl::memory::format_tag::nchw)); + } else { + desc.set_desc(::dnnl::memory::desc(dims, data_type, + ::dnnl::memory::format_tag::ncdhw)); + } + } +} + +template +sycl::event engine_ext::execute_primitive( + const std::pair + &primitive, + const std::vector &output_args) { + std::vector caches; + int output_arg_num = output_args.size(); + for (int i = 0; i < output_arg_num; i++) { + if (output_args[i]._beta != 0.f) { + auto cache = allocate(output_args[i]._desc); + caches.push_back(cache); + (*primitive.second.args)[output_args[i]._name].set_data_handle(cache); + } + } + + auto e = ::dnnl::sycl_interop::execute( + *(static_cast(primitive.second.primitive)), *_s, + *primitive.second.args); + _primitive_cache.put( + primitive.first, primitive.second.primitive, primitive.second.args, + [](::dnnl::primitive *p) { delete static_cast(p); }, e, + _q); + int cache_index = 0; + for (int i = 0; i < output_arg_num; i++) { + if (output_args[i]._beta != 0.f) { + e = async_sum(output_args[i]._alpha, output_args[i]._desc, + caches[cache_index++], output_args[i]._beta, + output_args[i]._desc, output_args[i]._data); + } else { + if (output_args[i]._alpha != 1.f) { + e = async_scale(output_args[i]._alpha, output_args[i]._desc, + output_args[i]._data); + } + } + } + return e; +} + +inline +::dnnl::memory::desc engine_ext::bn_reorder_memory_to_channel_major_format( + bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache) { + ::dnnl::memory::desc result; + result = transfer_memory_desc_to_channel_major_format(desc); + if ((result != desc) || !src) { + *cache = allocate(desc); + if (is_input && src) { + async_reorder(1.f, desc, src, 0.f, result, *cache); + } + } + return result; +} + +inline +sycl::event engine_ext::batch_normalization_backward_internal( + batch_normalization_mode mode, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data, + const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param, + const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias, + float beta_param, void *diff_scale, void *diff_bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var) { + if (scale_parameter_preprocess( + {{alpha_data, beta_data, diff_src_desc, diff_src}, + {alpha_param, beta_param, diff_scale_bias_desc, diff_scale}, + {alpha_param, beta_param, diff_scale_bias_desc, diff_bias}})) { + return sycl::event(); + } + + void *reordered_src = nullptr, *reordered_diff_dst = nullptr, + *reordered_diff_src = nullptr, *reordered_scale = nullptr, + *reordered_bias = nullptr, *reordered_diff_scale = nullptr, + *reordered_diff_bias = nullptr, *reordered_saved_mean = nullptr, + *reordered_saved_var = nullptr; + + ::dnnl::memory::desc help_src_desc = src_desc.get_desc(); + ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc(); + ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc(); + ::dnnl::memory::desc help_diff_scale_bias_desc = + diff_scale_bias_desc.get_desc(); + ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc(); + ::dnnl::memory::desc actual_diff_src_desc = help_diff_src_desc; + ::dnnl::memory::desc actual_diff_scale_bias_desc = help_diff_scale_bias_desc; + enter_primitive( + help_diff_scale_bias_desc.get_size() * 14 + help_src_desc.get_size() * 2 + + help_diff_dst_desc.get_size() * 7 + help_diff_src_desc.get_size() * 5 + + help_mean_var_desc.get_size() * 13); + if (mode == batch_normalization_mode::per_activation) { + help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src, + &reordered_src); + help_diff_dst_desc = bn_reorder_memory_to_channel_major_format( + true, help_diff_dst_desc, diff_dst, &reordered_diff_dst); + help_diff_src_desc = bn_reorder_memory_to_channel_major_format( + false, help_diff_src_desc, diff_src, &reordered_diff_src); + actual_diff_src_desc = help_diff_src_desc; + help_diff_scale_bias_desc = bn_reorder_memory_to_channel_major_format( + true, help_diff_scale_bias_desc, scale, &reordered_scale); + actual_diff_scale_bias_desc = help_diff_scale_bias_desc; + if (bias) { + bn_reorder_memory_to_channel_major_format(true, help_diff_scale_bias_desc, bias, + &reordered_bias); + } + bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc, + diff_scale, &reordered_diff_scale); + bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc, + diff_bias, &reordered_diff_bias); + + help_mean_var_desc = bn_reorder_memory_to_channel_major_format( + true, help_mean_var_desc, saved_mean, &reordered_saved_mean); + bn_reorder_memory_to_channel_major_format(true, help_mean_var_desc, saved_var, + &reordered_saved_var); + help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc); + help_diff_src_desc = + compress_spatial_dimensions_to_channel(help_diff_src_desc); + help_diff_dst_desc = + compress_spatial_dimensions_to_channel(help_diff_dst_desc); + } else { + if ((help_src_desc != help_diff_dst_desc) || + (help_src_desc != help_diff_src_desc) || + (help_diff_dst_desc != help_diff_src_desc)) { + help_src_desc = bn_reorder_memory_to_channel_major_format( + true, help_src_desc, src, &reordered_src); + help_diff_dst_desc = bn_reorder_memory_to_channel_major_format( + true, help_diff_dst_desc, diff_dst, &reordered_diff_dst); + help_diff_src_desc = bn_reorder_memory_to_channel_major_format( + false, help_diff_src_desc, diff_src, &reordered_diff_src); + actual_diff_src_desc = help_diff_src_desc; + } + } + + help_diff_scale_bias_desc = + get_bn_scale_bias_mean_var_desc(help_diff_scale_bias_desc, mode); + help_mean_var_desc = + get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode); + + auto forward_primitive = + create_primitive_desc<::dnnl::batch_normalization_forward>( + ::dnnl::prop_kind::forward_training, help_src_desc, + help_diff_dst_desc, epsilon, + ::dnnl::normalization_flags::use_scale | + ::dnnl::normalization_flags::use_shift); + auto primitive_args = + create_primitive_args_or_get<::dnnl::batch_normalization_backward>( + ::dnnl::prop_kind::backward, help_diff_src_desc, help_diff_dst_desc, + help_src_desc, epsilon, + ::dnnl::normalization_flags::use_scale | + ::dnnl::normalization_flags::use_shift, forward_primitive); + + void *dst_cache = nullptr; + if (!saved_mean && !saved_var) { + dst_cache = allocate(diff_dst_desc); + if (!reordered_saved_mean) { + reordered_saved_mean = allocate(mean_var_desc); + } + if (!reordered_saved_var) { + reordered_saved_var = allocate(mean_var_desc); + } + if (!bias) { + _q->fill(reordered_bias, 0, diff_scale_bias_desc.get_size()); + } + + batch_normalization_forward_internal( + true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, diff_dst_desc, + dst_cache, diff_scale_bias_desc, scale, bias ? bias : reordered_bias, + mean_var_desc, reordered_saved_mean, reordered_saved_var, nullptr, + nullptr); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc, + reordered_src ? reordered_src : src); + insert_arg(primitive_args.second.args, DNNL_ARG_SCALE, + help_diff_scale_bias_desc, + reordered_scale ? reordered_scale : scale); + insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc, + reordered_saved_mean ? reordered_saved_mean : saved_mean); + insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc, + reordered_saved_var ? reordered_saved_var : saved_var); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_src_desc, + reordered_diff_dst ? reordered_diff_dst : diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc, + reordered_diff_src ? reordered_diff_src : diff_src); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SCALE, + help_diff_scale_bias_desc, + reordered_diff_scale ? reordered_diff_scale : diff_scale); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SHIFT, + help_diff_scale_bias_desc, + reordered_diff_bias ? reordered_diff_bias : diff_bias); + + sycl::event e = execute_primitive<::dnnl::batch_normalization_backward>( + primitive_args, + {{alpha_data, beta_data, DNNL_ARG_DIFF_SRC, help_diff_src_desc, + reordered_diff_src ? reordered_diff_src : diff_src}, + {alpha_param, beta_param, DNNL_ARG_DIFF_SCALE, help_diff_scale_bias_desc, + reordered_diff_scale ? reordered_diff_scale : diff_scale}, + {alpha_param, beta_param, DNNL_ARG_DIFF_SHIFT, help_diff_scale_bias_desc, + reordered_diff_bias ? reordered_diff_bias : diff_bias}}); + if (actual_diff_src_desc != diff_src_desc.get_desc() && reordered_diff_src) { + e = async_reorder(1.f, actual_diff_src_desc, reordered_diff_src, 0.f, + diff_src_desc, diff_src); + } + if (actual_diff_scale_bias_desc != diff_scale_bias_desc.get_desc() && + reordered_diff_scale && reordered_diff_bias) { + async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_scale, 0.f, + diff_scale_bias_desc, diff_scale); + e = async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_bias, 0.f, + diff_scale_bias_desc, diff_bias); + } + return exit_primitive(e); +} + +inline +sycl::event engine_ext::batch_normalization_forward_internal( + bool is_infer, batch_normalization_mode mode, float epsilon, float factor, + float alpha, const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var, + void *running_mean, void *running_var) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + enter_primitive(src_desc.get_size() + 5 * dst_desc.get_size() + + scale_bias_desc.get_size() * 2 + + mean_var_desc.get_size() * 9); + void *reordered_src = nullptr, *reordered_dst = nullptr, + *reordered_scale = nullptr, *reordered_bias = nullptr, + *reordered_saved_mean = nullptr, *reordered_saved_var = nullptr; + ::dnnl::memory::desc help_src_desc = src_desc.get_desc(); + ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc(); + ::dnnl::memory::desc help_scale_bias_desc = scale_bias_desc.get_desc(); + ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc(); + ::dnnl::memory::desc actual_dst_desc = help_dst_desc; + ::dnnl::memory::desc actual_mean_var_desc = help_mean_var_desc; + + if (mode == batch_normalization_mode::per_activation) { + help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src, + &reordered_src); + help_dst_desc = bn_reorder_memory_to_channel_major_format( + false, help_dst_desc, dst, &reordered_dst); + actual_dst_desc = help_dst_desc; + help_scale_bias_desc = bn_reorder_memory_to_channel_major_format( + true, help_scale_bias_desc, scale, &reordered_scale); + bn_reorder_memory_to_channel_major_format(true, help_scale_bias_desc, bias, + &reordered_bias); + help_mean_var_desc = bn_reorder_memory_to_channel_major_format( + is_infer, help_mean_var_desc, saved_mean, + &reordered_saved_mean); + actual_mean_var_desc = help_mean_var_desc; + bn_reorder_memory_to_channel_major_format(is_infer, + help_mean_var_desc, saved_var, + &reordered_saved_var); + help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc); + help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc); + } else { + if (help_src_desc != help_dst_desc) { + help_src_desc = bn_reorder_memory_to_channel_major_format( + true, help_src_desc, src, &reordered_src); + help_dst_desc = bn_reorder_memory_to_channel_major_format( + false, help_dst_desc, dst, &reordered_dst); + actual_dst_desc = help_dst_desc; + } + } + help_scale_bias_desc = + get_bn_scale_bias_mean_var_desc(help_scale_bias_desc, mode); + help_mean_var_desc = + get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode); + + ::dnnl::prop_kind kind; + ::dnnl::normalization_flags flag = ::dnnl::normalization_flags::use_scale | + ::dnnl::normalization_flags::use_shift; + if (is_infer) { + kind = ::dnnl::prop_kind::forward_inference; + flag = ::dnnl::normalization_flags::use_global_stats | flag; + } else { + kind = ::dnnl::prop_kind::forward_training; + } + auto primitive_args = + create_primitive_args_or_get<::dnnl::batch_normalization_forward>( + kind, help_src_desc, help_dst_desc, epsilon, flag); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc, + reordered_src ? reordered_src : src); + insert_arg(primitive_args.second.args, DNNL_ARG_SCALE, help_scale_bias_desc, + reordered_scale ? reordered_scale : scale); + insert_arg(primitive_args.second.args, DNNL_ARG_SHIFT, help_scale_bias_desc, + reordered_bias ? reordered_bias : bias); + insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc, + reordered_saved_mean ? reordered_saved_mean + : saved_mean); + insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc, + reordered_saved_var ? reordered_saved_var + : saved_var); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, + reordered_dst ? reordered_dst : dst); + sycl::event e = execute_primitive<::dnnl::batch_normalization_forward>(primitive_args, + {{alpha, beta, DNNL_ARG_DST, help_dst_desc, + reordered_dst ? reordered_dst : dst}}); + + if (!is_infer && running_var) { + auto src_ndim = src_desc.get_desc().get_ndims(); + auto src_dims = src_desc.get_dims(); + int element_num = src_dims[0]; + if (mode == batch_normalization_mode::spatial) { + for (int index = 2; index < src_ndim; index++) { + element_num *= src_dims[index]; + } + } + float unbias_factor = element_num / (element_num - 1.f); + async_scale(1.f - factor, mean_var_desc, running_var); + e = async_sum(factor * unbias_factor, mean_var_desc, + reordered_saved_var ? reordered_saved_var : saved_var, + 1.f, mean_var_desc, running_var); + } + if (!is_infer && running_mean) { + e = async_sum(factor, mean_var_desc, + reordered_saved_mean ? reordered_saved_mean : saved_mean, + (1.f - factor), mean_var_desc, running_mean); + } + if (reordered_dst && (actual_dst_desc != dst_desc.get_desc())) { + e = async_reorder(1.f, actual_dst_desc, reordered_dst, 0.f, dst_desc, dst); + } + if (!is_infer && reordered_saved_mean && reordered_saved_var && saved_mean && + saved_var && (actual_mean_var_desc != mean_var_desc.get_desc())) { + e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_mean, 0.f, + mean_var_desc, saved_mean); + e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_var, 0.f, + mean_var_desc, saved_var); + } + return exit_primitive(e); +} + +inline +sycl::event engine_ext::rnn_forward_internal( + const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter, + const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c, + size_t weight_size, void *weight, size_t workspace_size, void *workspace, + size_t scratchpad_size, void *scratchpad, bool is_get_execution_args, + size_t *weight_size_query, size_t *workspace_size_query, + size_t *scratchpad_size_query) { + ::dnnl::memory::data_type src_dt; + ::dnnl::memory::format_tag src_format_tag; + rnn_mode mode; + rnn_bias_mode bias_mode; + rnn_direction direction; + dpct::library_data_t dt; + int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0, + layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0, + seq_length = 1, batch_size = 1; + std::vector data = {src, dst, src_iter, dst_iter, + src_iter_c, dst_iter_c, weight, workspace, + scratchpad}; + std::vector offset(6, 0); + void *input_layer_cache = nullptr, *hidden_layer_cache = nullptr; + sycl::event e; + enter_primitive(src_desc.get_size() * 2); + desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size, + &projection_size, &layer_size); + + get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size, + &src_dt, &src_format_tag, &projection_size, + &output_size, &seq_length, &batch_size, &direction_num, + &gate_num); + + if (direction == rnn_direction::bidirectional) { + // Here to combine the oneDNN bidirectional_sum and + // bidirectional_concat config, so call execute_rnn_forward_primitive + // twice. + if (layer_size > 1) { + if (!is_get_execution_args) { + input_layer_cache = allocate(src_desc); + hidden_layer_cache = allocate(src_desc); + _q->memcpy(input_layer_cache, src, src_desc.get_size()); + } + data[0] = input_layer_cache; + data[1] = hidden_layer_cache; + e = execute_rnn_forward_primitive( + mode, kind, ::dnnl::rnn_direction::bidirectional_sum, bias_mode, + src_dt, src_format_tag, seq_length, batch_size, output_size, + output_size, 1, direction_num, hidden_size, gate_num, projection_size, + data, offset, layer_size - 1, weight_size_query, workspace_size_query, + scratchpad_size_query); + data[0] = + ((layer_size - 1) % 2 == 0) ? input_layer_cache : hidden_layer_cache; + data[1] = dst; + } + e = execute_rnn_forward_primitive( + mode, kind, ::dnnl::rnn_direction::bidirectional_concat, bias_mode, + src_dt, src_format_tag, seq_length, batch_size, output_size, + 2 * output_size, 1, direction_num, hidden_size, gate_num, + projection_size, data, offset, 1, weight_size_query, + workspace_size_query, scratchpad_size_query); + } else { + e = execute_rnn_forward_primitive( + mode, kind, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode, + src_dt, src_format_tag, seq_length, batch_size, output_size, + output_size, layer_size, direction_num, hidden_size, gate_num, + projection_size, data, offset, 1, weight_size_query, + workspace_size_query, scratchpad_size_query); + } + + return exit_primitive(e); +} + +inline +sycl::event engine_ext::execute_rnn_forward_primitive( + rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction, + rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt, + ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c, + int dst_c, int layer_size, int direction_num, int hidden_size, int gate_num, + int projection_size, std::vector &data, std::vector &offset, + int iter_num, size_t *weight_size, size_t *workspace_size, + size_t *scratchpad_size) { + + sycl::event e; + ::dnnl::primitive *p = nullptr; + std::unordered_map *args = nullptr; + detail::primitive_cache_key_type key; + std::unordered_map *execution_args; + ::dnnl::memory::desc bias_desc( + {layer_size, direction_num, gate_num, hidden_size}, dt, + ::dnnl::memory::format_tag::ldgo); + ::dnnl::memory::desc weight_layer_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldigo); + ::dnnl::memory::desc weight_iter_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldigo); + ::dnnl::memory::desc projection_desc; + if (projection_size) { + projection_desc = ::dnnl::memory::desc( + {layer_size, direction_num, hidden_size, projection_size}, dt, + ::dnnl::memory::format_tag::ldio); + } + + if (weight_size) { + *weight_size += + (weight_layer_desc.get_size() + weight_iter_desc.get_size() + + projection_desc.get_size() + bias_desc.get_size()) * + iter_num; + return e; + } + + ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag); + ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag); + ::dnnl::memory::desc iter_desc( + {layer_size, direction_num, batch_size, + projection_size ? projection_size : hidden_size}, + dt, ::dnnl::memory::format_tag::ldnc); + ::dnnl::memory::desc iter_c_desc( + {layer_size, direction_num, batch_size, hidden_size}, dt, + ::dnnl::memory::format_tag::ldnc); + + ::dnnl::memory::desc workspace_desc; + ::dnnl::memory::desc scratchpad_desc; + ::dnnl::primitive_attr attr; + attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user); + + if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) { + auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_forward>( + kind, + mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu + : ::dnnl::algorithm::eltwise_tanh, + direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc, + bias_desc, dst_desc, iter_desc, attr); + + auto pd = get_primitive_desc<::dnnl::vanilla_rnn_forward>( + primitive.second.primitive); + + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + if (workspace_size && scratchpad_size) { + *workspace_size += workspace_desc.get_size() * iter_num; + *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size + ? scratchpad_desc.get_size() + : *scratchpad_size; + } else { + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } + } else if (mode == rnn_mode::gru) { + auto primitive = create_primitive_args_or_get<::dnnl::gru_forward>( + kind, direction, src_desc, iter_desc, weight_layer_desc, + weight_iter_desc, bias_desc, dst_desc, iter_desc, attr); + + auto pd = + get_primitive_desc<::dnnl::gru_forward>(primitive.second.primitive); + + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + if (workspace_size && scratchpad_size) { + *workspace_size += workspace_desc.get_size() * iter_num; + *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size + ? scratchpad_desc.get_size() + : *scratchpad_size; + } else { + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } + } else if (mode == rnn_mode::lstm) { + auto primitive = create_primitive_args_or_get<::dnnl::lstm_forward>( + kind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc, + weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc, + dst_desc, iter_desc, iter_c_desc, attr); + + auto pd = + get_primitive_desc<::dnnl::lstm_forward>(primitive.second.primitive); + + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + if (workspace_size && scratchpad_size) { + *workspace_size += workspace_desc.get_size() * iter_num; + *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size + ? scratchpad_desc.get_size() + : *scratchpad_size; + } else { + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } + } + + for (int i = 0; i < iter_num; i++) { + void *in_cache = data[0], *out_cache = data[1], *dst_iter_c_cache = nullptr, + *dst_iter_cache = ((uint8_t *)(data[3]) + offset[1]); + if (mode == rnn_mode::lstm) { + dst_iter_c_cache = (uint8_t *)(data[4]) + offset[2]; + } + if (!workspace_size) { + insert_arg(args, DNNL_ARG_SRC_LAYER, src_desc, data[0]); + insert_arg(args, DNNL_ARG_DST_LAYER, dst_desc, data[1]); + insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[8]); + auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data, + int &offset) { + insert_arg(args, arg_name, d, (uint8_t *)data + offset); + offset += d.get_size(); + }; + insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[0]); + insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[3], offset[1]); + + if (mode == rnn_mode::lstm) { + insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]); + insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[5], offset[3]); + } + insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, weight_layer_desc, data[6], + offset[4]); + insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, weight_iter_desc, data[6], offset[4]); + if (projection_size) { + insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, projection_desc, data[6], + offset[4]); + } + if (bias_mode == rnn_bias_mode::none) { + _q->memset((uint8_t *)(data[6]) + offset[4], 0, bias_desc.get_size()); + } + insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[4]); + if (kind == ::dnnl::prop_kind::forward_training) { + insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[5]); + } + if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) { + execute_primitive<::dnnl::vanilla_rnn_forward>( + {key, {static_cast<::dnnl::vanilla_rnn_forward *>(p), args}}); + } else if (mode == rnn_mode::gru) { + execute_primitive<::dnnl::gru_forward>( + {key, {static_cast<::dnnl::gru_forward *>(p), args}}); + } else if (mode == rnn_mode::lstm) { + execute_primitive<::dnnl::lstm_forward>( + {key, {static_cast<::dnnl::lstm_forward *>(p), args}}); + } + if (i != iter_num - 1) { + std::swap(data[0], data[1]); + } + } + if (kind == ::dnnl::prop_kind::forward_training) { + if (workspace_size) { + *workspace_size += + (src_desc.get_size() + dst_desc.get_size() + iter_desc.get_size()); + if (mode == rnn_mode::lstm) { + *workspace_size += iter_c_desc.get_size(); + } + } else { + _q->memcpy((uint8_t *)(data[7]) + offset[5], in_cache, + src_desc.get_size()); + offset[5] += src_desc.get_size(); + _q->memcpy((uint8_t *)(data[7]) + offset[5], out_cache, + dst_desc.get_size()); + offset[5] += dst_desc.get_size(); + _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_cache, + iter_desc.get_size()); + offset[5] += iter_desc.get_size(); + if (mode == rnn_mode::lstm) { + _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_c_cache, + iter_c_desc.get_size()); + offset[5] += iter_c_desc.get_size(); + } + } + } + } + return e; +} + +inline +sycl::event engine_ext::execute_rnn_backward_primitive( + rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode, + ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag, + int seq_length, int batch_size, int src_c, int dst_c, int layer_size, + int direction_num, int hidden_size, int gate_num, int projection_size, + std::vector &data, std::vector &offset, int iter_num) { + + sycl::event e; + ::dnnl::primitive *p = nullptr; + std::unordered_map *args = nullptr; + detail::primitive_cache_key_type key; + ::dnnl::prop_kind fkind = ::dnnl::prop_kind::forward_training; + ::dnnl::prop_kind bkind = ::dnnl::prop_kind::backward; + ::dnnl::memory::desc bias_desc( + {layer_size, direction_num, gate_num, hidden_size}, dt, + ::dnnl::memory::format_tag::ldgo); + ::dnnl::memory::desc weight_layer_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldigo); + ::dnnl::memory::desc weight_iter_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldigo); + ::dnnl::memory::desc diff_weight_layer_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldgoi); + ::dnnl::memory::desc diff_weight_iter_desc( + {layer_size, direction_num, + projection_size ? projection_size : hidden_size, gate_num, hidden_size}, + dt, ::dnnl::memory::format_tag::ldgoi); + ::dnnl::memory::desc projection_desc, diff_projection_desc; + if (projection_size) { + projection_desc = ::dnnl::memory::desc( + {layer_size, direction_num, hidden_size, projection_size}, dt, + ::dnnl::memory::format_tag::ldio); + diff_projection_desc = ::dnnl::memory::desc( + {layer_size, direction_num, hidden_size, projection_size}, dt, + ::dnnl::memory::format_tag::ldoi); + } + + ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag); + ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag); + ::dnnl::memory::desc iter_desc( + {layer_size, direction_num, batch_size, + projection_size ? projection_size : hidden_size}, + dt, ::dnnl::memory::format_tag::ldnc); + ::dnnl::memory::desc iter_c_desc( + {layer_size, direction_num, batch_size, hidden_size}, dt, + ::dnnl::memory::format_tag::ldnc); + + ::dnnl::memory::desc workspace_desc; + ::dnnl::memory::desc scratchpad_desc; + ::dnnl::primitive_attr attr; + attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user); + + if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) { + auto fpd = create_primitive_desc<::dnnl::vanilla_rnn_forward>( + fkind, + mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu + : ::dnnl::algorithm::eltwise_tanh, + direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc, + bias_desc, dst_desc, iter_desc, attr); + auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_backward>( + bkind, + mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu + : ::dnnl::algorithm::eltwise_tanh, + direction, src_desc, iter_desc, diff_weight_layer_desc, + diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc, + iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc, + iter_desc, fpd, attr); + auto pd = get_primitive_desc<::dnnl::vanilla_rnn_backward>( + primitive.second.primitive); + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } else if (mode == rnn_mode::gru) { + auto fpd = create_primitive_desc<::dnnl::gru_forward>( + fkind, direction, src_desc, iter_desc, weight_layer_desc, + weight_iter_desc, bias_desc, dst_desc, iter_desc, attr); + auto primitive = create_primitive_args_or_get<::dnnl::gru_backward>( + bkind, direction, src_desc, iter_desc, diff_weight_layer_desc, + diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc, + iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc, + iter_desc, fpd, attr); + auto pd = + get_primitive_desc<::dnnl::gru_backward>(primitive.second.primitive); + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } else if (mode == rnn_mode::lstm) { + auto fpd = create_primitive_desc<::dnnl::lstm_forward>( + fkind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc, + weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc, + dst_desc, iter_desc, iter_c_desc, attr); + auto primitive = create_primitive_args_or_get<::dnnl::lstm_backward>( + bkind, direction, src_desc, iter_desc, iter_c_desc, + diff_weight_layer_desc, diff_weight_iter_desc, ::dnnl::memory::desc(), + diff_projection_desc, bias_desc, dst_desc, iter_desc, iter_c_desc, + src_desc, iter_desc, iter_c_desc, weight_layer_desc, weight_iter_desc, + ::dnnl::memory::desc(), projection_desc, bias_desc, dst_desc, iter_desc, + iter_c_desc, fpd, attr); + auto pd = + get_primitive_desc<::dnnl::lstm_backward>(primitive.second.primitive); + workspace_desc = pd.workspace_desc(); + scratchpad_desc = pd.scratchpad_desc(); + key = primitive.first; + p = primitive.second.primitive; + args = primitive.second.args; + } + + for (int i = 0; i < iter_num; i++) { + insert_arg(args, DNNL_ARG_DIFF_SRC_LAYER, src_desc, data[8]); + insert_arg(args, DNNL_ARG_DIFF_DST_LAYER, dst_desc, data[9]); + insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[15]); + auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data, + int &offset) { + offset += d.get_size(); + insert_arg(args, arg_name, d, (uint8_t *)data - offset); + }; + if (mode == rnn_mode::lstm) { + insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[7], offset[0]); + insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]); + } + insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[7], offset[0]); + insert_rnn_arg(DNNL_ARG_DST_LAYER, dst_desc, data[7], offset[0]); + insert_rnn_arg(DNNL_ARG_SRC_LAYER, src_desc, data[7], offset[0]); + insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[0]); + insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[1]); + insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[3]); + if (projection_size) { + insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, diff_projection_desc, data[6], + offset[3]); + } + insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, diff_weight_iter_desc, data[6], + offset[3]); + insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, diff_weight_layer_desc, data[6], + offset[3]); + insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER, iter_desc, data[10], offset[4]); + insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER, iter_desc, data[11], offset[5]); + if (mode == rnn_mode::lstm) { + insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER_C, iter_c_desc, data[12], offset[6]); + insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER_C, iter_c_desc, data[13], offset[7]); + } + insert_rnn_arg(DNNL_ARG_DIFF_BIAS, bias_desc, data[14], offset[8]); + if (bias_mode == rnn_bias_mode::none) { + _q->memset((uint8_t *)(data[14]) - offset[8], 0, bias_desc.get_size()); + } + if (projection_size) { + insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, projection_desc, data[14], + offset[8]); + } + insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_ITER, weight_iter_desc, data[14], + offset[8]); + insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_LAYER, weight_layer_desc, data[14], + offset[8]); + if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) { + e = execute_primitive<::dnnl::vanilla_rnn_backward>( + {key, {static_cast<::dnnl::vanilla_rnn_backward *>(p), args}}); + } else if (mode == rnn_mode::gru) { + e = execute_primitive<::dnnl::gru_backward>( + {key, {static_cast<::dnnl::gru_backward *>(p), args}}); + } else if (mode == rnn_mode::lstm) { + e = execute_primitive<::dnnl::lstm_backward>( + {key, {static_cast<::dnnl::lstm_backward *>(p), args}}); + } + if (i != iter_num - 1) { + std::swap(data[8], data[9]); + } + } + return e; +} + +#define EMPTY_CACHE_KEY(type) \ + template <> \ + inline void engine_ext::generate_cache_key(std::string & key_buffer, \ + const type &arg) {} + +EMPTY_CACHE_KEY(::dnnl::engine) +EMPTY_CACHE_KEY(::dnnl::convolution_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::eltwise_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::softmax_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::pooling_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::lrn_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::batch_normalization_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::vanilla_rnn_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::lstm_forward::primitive_desc) +EMPTY_CACHE_KEY(::dnnl::gru_forward::primitive_desc) +#undef EMPTY_CACHE_KEY + +template <> +inline void engine_ext::generate_cache_key>( + std::string &key_buffer, const std::vector &vec) { + key_buffer.append((char *)vec.data(), vec.size() * sizeof(float)); +} + +template <> +inline void engine_ext::generate_cache_key<::dnnl::primitive_attr>( + std::string &key_buffer, const ::dnnl::primitive_attr &attr) { + if (!attr) { + return; + } + auto math_mode = (uint8_t)attr.get_fpmath_mode(); + key_buffer.append((char *)&math_mode, sizeof(uint8_t)); +} + +template <> +inline void engine_ext::generate_cache_key<::dnnl::memory::dims>( + std::string &key_buffer, const ::dnnl::memory::dims &dims) { + key_buffer.append((char *)dims.data(), dims.size() * sizeof(int64_t)); +} + +template <> +inline void engine_ext::generate_cache_key<::dnnl::memory::desc>( + std::string &key_buffer, const ::dnnl::memory::desc &desc) { + uint8_t params[3] = {(uint8_t)desc.get_format_kind(), + (uint8_t)desc.get_ndims(), + (uint8_t)desc.get_data_type()}; + generate_cache_key(key_buffer, desc.get_inner_blks()); + generate_cache_key(key_buffer, desc.get_dims()); + generate_cache_key(key_buffer, desc.get_strides()); +} + +template +void engine_ext::generate_cache_key(std::string &key_buffer, const T &arg) { + key_buffer.append((char *)&arg, sizeof(T)); +} + +template +void engine_ext::generate_cache_key(std::string &key_buffer, const T &first_arg, + const args_type &...args) { + generate_cache_key(key_buffer, first_arg); + generate_cache_key(key_buffer, args...); +} + +template +std::pair +engine_ext::create_primitive_args_or_get(args_type &&...args) { + std::string buffer; + buffer.reserve(512); + generate_cache_key(buffer, std::forward(args)...); + buffer.append(std::to_string(_engine_id)); + auto value = _primitive_cache.get(buffer); + primitive_type *p = nullptr; + std::unordered_map *a = nullptr; + if (value) { + p = (primitive_type *)value->_primitive; + a = value->_args; + } else { + p = new primitive_type(create_primitive_desc( + std::forward(args)...)); + a = new std::unordered_map(); + } + return {buffer, {p, a}}; +} + +template +typename primitive_type::primitive_desc +engine_ext::get_primitive_desc(::dnnl::primitive *p) { + return typename primitive_type::primitive_desc( + const_cast(p->get_primitive_desc())); +} + +template +typename primitive_type::primitive_desc +engine_ext::create_primitive_desc(args_type &&...args) { + return typename primitive_type::primitive_desc( + *_eng, std::forward(args)...); +} + +inline +void engine_ext::fill(const memory_desc_ext &src_desc, void *src, + const void *valuePtr) { + async_fill(src_desc, src, valuePtr).wait(); +} + +inline +void engine_ext::reorder(float alpha, const memory_desc_ext &src_desc, + void *src, float beta, const memory_desc_ext &dst_desc, + void *dst) { + async_reorder(alpha, src_desc, src, beta, dst_desc, dst).wait(); +} + +inline +void engine_ext::scale(float alpha, const memory_desc_ext &src_desc, + void *src) { + async_scale(alpha, src_desc, src).wait(); +} +inline +void engine_ext::sum(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst) { + async_sum(alpha, src_desc, src, beta, dst_desc, dst).wait(); +} +inline +void engine_ext::activation_forward(activation_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst) { + async_activation_forward(desc, alpha, src_desc, src, beta, dst_desc, dst) + .wait(); +} +inline +void engine_ext::activation_backward( + activation_desc &desc, float alpha, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src) { + async_activation_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst, + src_desc, src, beta, diff_src_desc, diff_src) + .wait(); +} +inline +void engine_ext::pooling_forward(pooling_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, + ::dnnl::memory *workspace) { + async_pooling_forward(desc, alpha, src_desc, src, beta, dst_desc, dst, + workspace).wait(); +} + +inline +void engine_ext::pooling_backward( + pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src, + ::dnnl::memory *workspace) { + async_pooling_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst, + src_desc, src, beta, diff_src_desc, diff_src, + workspace) + .wait(); +} + +inline +void engine_ext::softmax_forward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, void *dst) { + async_softmax_forward(alg, mode, alpha, src_desc, src, beta, dst_desc, dst) + .wait(); +} + +inline +void engine_ext::softmax_backward(softmax_algorithm alg, softmax_mode mode, + float alpha, const memory_desc_ext &dst_desc, + void *dst, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src) { + async_softmax_backward(alg, mode, alpha, dst_desc, dst, diff_dst_desc, + diff_dst, beta, diff_src_desc, diff_src) + .wait(); +} + +inline +void engine_ext::lrn_forward(lrn_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace) { + async_lrn_forward(desc, alpha, src_desc, src, beta, dst_desc, dst, workspace) + .wait(); +} + +inline +void engine_ext::lrn_backward(lrn_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, + void *diff_dst, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &diff_src_desc, + void *diff_src, + ::dnnl::memory *workspace) { + async_lrn_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst, + src_desc, src, beta, diff_src_desc, diff_src, workspace) + .wait(); +} + +inline +sycl::event engine_ext::async_fill(const memory_desc_ext &src_desc, void *src, + const void *valuePtr) { + ::dnnl::memory::data_type dt = src_desc.get_desc().get_data_type(); + unsigned mem_size = src_desc.get_size(); + switch (dt) { + case ::dnnl::memory::data_type::f32: + return fill_with_type(_q, src, valuePtr, mem_size); + case ::dnnl::memory::data_type::f16: + return fill_with_type(_q, src, valuePtr, mem_size); + case ::dnnl::memory::data_type::s32: + return fill_with_type(_q, src, valuePtr, mem_size); + case ::dnnl::memory::data_type::s8: + return fill_with_type(_q, src, valuePtr, mem_size); + case ::dnnl::memory::data_type::u8: + return fill_with_type(_q, src, valuePtr, mem_size); + default: + throw std::runtime_error("async_fill: unsupported data type."); + } +} + +inline +sycl::event engine_ext::async_reorder(float alpha, const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, void *dst) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + enter_primitive(2 * dst_desc.get_size()); + + auto primitive_args = create_primitive_args_or_get<::dnnl::reorder>( + src_desc.get_desc(), *_eng, dst_desc.get_desc()); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + return exit_primitive(execute_primitive<::dnnl::reorder>( + primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}})); +} + +inline +sycl::event engine_ext::async_scale(float alpha, const memory_desc_ext &src_desc, + void *src) { + if (alpha == 1.f) { + return sycl::event(); + } + size_t cache_size = src_desc.get_size(); + enter_primitive(cache_size); + void *src_cache = allocate(cache_size); + _q->memcpy(src_cache, src, cache_size); + auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>( + ::dnnl::prop_kind::forward_inference, ::dnnl::algorithm::eltwise_linear, + src_desc.get_desc(), src_desc.get_desc(), alpha, 0.f); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src_cache); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, src_desc.get_desc(), + src); + + return exit_primitive( + execute_primitive<::dnnl::eltwise_forward>(primitive_args)); +} + +inline sycl::event +engine_ext::async_sum(float alpha, const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, void *dst) { + if (alpha == 0.f && beta == 1.f) { + return sycl::event(); + } + size_t cache_size = dst_desc.get_size(); + enter_primitive(cache_size); + void *dst_cache = allocate(dst_desc); + _q->memcpy(dst_cache, dst, cache_size); + + auto primitive_args = create_primitive_args_or_get<::dnnl::sum>( + std::vector{alpha, beta}, + std::vector<::dnnl::memory::desc>{src_desc.get_desc(), + dst_desc.get_desc()}); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC, + src_desc.get_desc(), src); + insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC + 1, + dst_desc.get_desc(), dst_cache); + + return exit_primitive(execute_primitive<::dnnl::sum>(primitive_args)); +} + +inline +sycl::event engine_ext::async_binary(binary_op op, float alpha_0, + const memory_desc_ext &src_desc_0, void *src_0, + float alpha_1, const memory_desc_ext &src_desc_1, + void *src_1, float beta, + const memory_desc_ext &dst_desc, void *dst) { + ::dnnl::algorithm onednn_algorithm; + switch (op) { + case binary_op::max: + onednn_algorithm = ::dnnl::algorithm::binary_max; + break; + case binary_op::min: + onednn_algorithm = ::dnnl::algorithm::binary_min; + break; + case binary_op::add: + onednn_algorithm = ::dnnl::algorithm::binary_add; + break; + case binary_op::sub: + onednn_algorithm = ::dnnl::algorithm::binary_sub; + break; + case binary_op::mul: + onednn_algorithm = ::dnnl::algorithm::binary_mul; + break; + case binary_op::div: + onednn_algorithm = ::dnnl::algorithm::binary_div; + break; + case binary_op::sqrt: + onednn_algorithm = ::dnnl::algorithm::eltwise_sqrt; + break; + case binary_op::neg: + onednn_algorithm = ::dnnl::algorithm::eltwise_linear; + break; + } + size_t src0_cache_size = src_desc_0.get_size(); + size_t src1_cache_size = src_desc_1.get_size(); + size_t dst_cache_size = dst_desc.get_size(); + enter_primitive(2 * src0_cache_size + 2 * src1_cache_size + + 5 * dst_cache_size); + if (onednn_algorithm == ::dnnl::algorithm::eltwise_sqrt || + onednn_algorithm == ::dnnl::algorithm::eltwise_linear) { + void *src_cache = nullptr, *dst_cache = nullptr; + src_cache = allocate(src0_cache_size); + dst_cache = allocate(dst_cache_size); + _q->memcpy(src_cache, src_0, src0_cache_size); + _q->memcpy(dst_cache, dst, dst_cache_size); + async_scale(alpha_0, src_desc_0, src_cache); + async_scale(beta, dst_desc, dst_cache); + + // Let the output = 1 - input to simulate the behavior of neg. + auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>( + ::dnnl::prop_kind::forward_inference, onednn_algorithm, + src_desc_0.get_desc(), dst_desc.get_desc(), -1.f, 1.f); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc_0.get_desc(), + src_cache); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + execute_primitive<::dnnl::eltwise_forward>( + primitive_args, {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}}); + return exit_primitive( + async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst)); + } + + void *src_0_cache = nullptr, *src_1_cache = nullptr, *dst_cache = nullptr; + + src_0_cache = allocate(src0_cache_size); + src_1_cache = allocate(src1_cache_size); + dst_cache = allocate(dst_cache_size); + + _q->memcpy(src_0_cache, src_0, src0_cache_size); + _q->memcpy(src_1_cache, src_1, src1_cache_size); + _q->memcpy(dst_cache, dst, dst_cache_size); + + async_scale(alpha_0, src_desc_0, src_0_cache); + async_scale(alpha_1, src_desc_1, src_1_cache); + async_scale(beta, dst_desc, dst_cache); + + auto primitive_args = create_primitive_args_or_get<::dnnl::binary>( + onednn_algorithm, src_desc_0.get_desc(), src_desc_1.get_desc(), + dst_desc.get_desc()); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc_0.get_desc(), + src_0_cache); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc_1.get_desc(), + src_1_cache); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + execute_primitive<::dnnl::binary>(primitive_args, + {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}}); + return exit_primitive( + async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst)); +} + +inline +sycl::event engine_ext::async_reduction(reduction_op op, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst) { + if (alpha == 0.f && beta == 1.f) { + return sycl::event(); + } + size_t src_cache_size = src_desc.get_size(); + size_t dst_cache_size = dst_desc.get_size(); + enter_primitive(3 * src_cache_size + 2 * dst_cache_size); + float p = 2.f; + ::dnnl::algorithm onednn_algorithm; + void *cache = nullptr; + switch (op) { + case reduction_op::amax: + cache = allocate(src_cache_size); + activation_desc adesc; + adesc.set_algorithm(::dnnl::algorithm::eltwise_abs); + async_activation_forward(adesc, 1.f, src_desc, src, 0.f, src_desc, cache); + onednn_algorithm = ::dnnl::algorithm::reduction_max; + src = cache; + break; + case reduction_op::max: + onednn_algorithm = ::dnnl::algorithm::reduction_max; + break; + case reduction_op::min: + onednn_algorithm = ::dnnl::algorithm::reduction_min; + break; + case reduction_op::sum: + onednn_algorithm = ::dnnl::algorithm::reduction_sum; + break; + case reduction_op::mean: + onednn_algorithm = ::dnnl::algorithm::reduction_mean; + break; + case reduction_op::mul: + onednn_algorithm = ::dnnl::algorithm::reduction_mul; + break; + case reduction_op::mul_no_zeros: + cache = allocate(src_cache_size); + transform_no_zero(src_desc, src, cache); + onednn_algorithm = ::dnnl::algorithm::reduction_mul; + src = cache; + break; + case reduction_op::norm1: + p = 1.f; + onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_power_p_sum; + break; + case reduction_op::norm2: + onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_sum; + break; + } + auto primitive_args = create_primitive_args_or_get<::dnnl::reduction>( + onednn_algorithm, src_desc.get_desc(), dst_desc.get_desc(), p, 0.f); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + return exit_primitive(execute_primitive<::dnnl::reduction>( + primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}})); +} + +inline +sycl::event engine_ext::async_activation_forward(activation_desc &desc, float alpha, + const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, + void *dst) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + enter_primitive(2 * dst_desc.get_size()); + auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>( + ::dnnl::prop_kind::forward, desc.get_algorithm(), src_desc.get_desc(), + dst_desc.get_desc(), desc.get_alpha(), desc.get_beta()); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + return exit_primitive(execute_primitive<::dnnl::eltwise_forward>( + primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}})); +} + +inline +sycl::event engine_ext::async_activation_backward( + activation_desc &desc, float alpha, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src) { + + if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) { + return sycl::event(); + } + enter_primitive(2 * diff_src_desc.get_size()); + ::dnnl::memory::desc data_desc = dst_desc.get_desc(); + auto alg = desc.get_algorithm(); + if ((alg == ::dnnl::algorithm::eltwise_clip) || + (alg == ::dnnl::algorithm::eltwise_linear) || + (alg == ::dnnl::algorithm::eltwise_swish)) { + data_desc = src_desc.get_desc(); + } + auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_backward>( + alg, diff_src_desc.get_desc(), diff_dst_desc.get_desc(), data_desc, + desc.get_alpha(), desc.get_beta(), + create_primitive_desc<::dnnl::eltwise_forward>( + ::dnnl::prop_kind::forward, alg, src_desc.get_desc(), + dst_desc.get_desc(), desc.get_alpha(), desc.get_beta())); + + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, + diff_dst_desc.get_desc(), diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, + diff_src_desc.get_desc(), diff_src); + + return exit_primitive(execute_primitive<::dnnl::eltwise_backward>( + primitive_args, + {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}})); +} + +inline +sycl::event engine_ext::async_pooling_forward(pooling_desc &desc, float alpha, + const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + enter_primitive(2 * dst_desc.get_size()); + int pooling_dim = desc.get_stride().size(); + std::vector dilation(pooling_dim, 0); + auto primitive_args = + create_primitive_args_or_get<::dnnl::pooling_forward>( + ::dnnl::prop_kind::forward_training, desc.get_algorithm(), + src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(), + desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding()); + auto pd = get_primitive_desc<::dnnl::pooling_forward>( + primitive_args.second.primitive); + ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng); + if (workspace) { + *workspace = ws_mem; + } else { + insert_workspace(src, ws_mem); + } + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + return exit_primitive(execute_primitive<::dnnl::pooling_forward>( + primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}})); +} + +inline +sycl::event engine_ext::async_pooling_backward( + pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src, + ::dnnl::memory *workspace) { + if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) { + return sycl::event(); + } + enter_primitive(2 * diff_src_desc.get_size()); + int pooling_dim = desc.get_stride().size(); + std::vector dilation(pooling_dim, 0); + auto primitive_args = create_primitive_args_or_get<::dnnl::pooling_backward>( + desc.get_algorithm(), diff_src_desc.get_desc(), diff_dst_desc.get_desc(), + desc.get_stride(), desc.get_kernel(), dilation, desc.get_padding(), + desc.get_padding(), + create_primitive_desc<::dnnl::pooling_forward>( + ::dnnl::prop_kind::forward_training, desc.get_algorithm(), + src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(), + desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding())); + + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, + diff_dst_desc.get_desc(), diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, + diff_src_desc.get_desc(), diff_src); + + if (workspace) { + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace); + } else { + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, + get_workspace(src)); + } + + return exit_primitive(execute_primitive<::dnnl::pooling_backward>( + primitive_args, + {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}})); +} + +inline +sycl::event engine_ext::async_softmax_forward(softmax_algorithm alg, + softmax_mode mode, float alpha, + const memory_desc_ext &src_desc, + void *src, float beta, + const memory_desc_ext &dst_desc, + void *dst) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + + ::dnnl::memory::desc help_src_desc = src_desc.get_desc(); + ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc(); + if (mode == softmax_mode::instance) { + help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc); + help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc); + } + enter_primitive(2 * help_dst_desc.get_size()); + + ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate; + if (alg == softmax_algorithm::log) { + softmax_alg = ::dnnl::algorithm::softmax_log; + } + auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_forward>( + ::dnnl::prop_kind::forward, softmax_alg, help_src_desc, + help_dst_desc, 1); + + insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc, src); + + return exit_primitive(execute_primitive<::dnnl::softmax_forward>( + primitive_args, + {{alpha, beta, DNNL_ARG_DST, memory_desc_ext(help_dst_desc), dst}})); +} + +inline +sycl::event engine_ext::async_softmax_backward( + softmax_algorithm alg, softmax_mode mode, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src) { + if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) { + return sycl::event(); + } + ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc(); + ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc(); + ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc(); + if (mode == softmax_mode::instance) { + help_diff_src_desc = + compress_spatial_dimensions_to_channel(help_diff_src_desc); + help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc); + help_diff_dst_desc = + compress_spatial_dimensions_to_channel(help_diff_dst_desc); + } + enter_primitive(2 * help_diff_src_desc.get_size()); + + ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate; + if (alg == softmax_algorithm::log) { + softmax_alg = ::dnnl::algorithm::softmax_log; + } + + auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_backward>( + softmax_alg, help_diff_src_desc, help_diff_dst_desc, help_dst_desc, 1, + create_primitive_desc<::dnnl::softmax_forward>( + ::dnnl::prop_kind::forward, softmax_alg, help_diff_src_desc, + help_dst_desc, 1)); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_dst_desc, + diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc, + diff_src); + + return exit_primitive(execute_primitive<::dnnl::softmax_backward>( + primitive_args, {{alpha, beta, DNNL_ARG_DIFF_SRC, + memory_desc_ext(help_diff_src_desc), diff_src}})); +} + +inline +sycl::event engine_ext::async_lrn_forward(lrn_desc &desc, float alpha, + const memory_desc_ext &src_desc, void *src, + float beta, const memory_desc_ext &dst_desc, + void *dst, ::dnnl::memory *workspace) { + + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + enter_primitive(2 * dst_desc.get_size()); + auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_forward>( + ::dnnl::prop_kind::forward_training, + ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(), + dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(), + desc.get_beta(), desc.get_k()); + auto pd = + get_primitive_desc<::dnnl::lrn_forward>(primitive_args.second.primitive); + ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng); + if (workspace) { + *workspace = ws_mem; + } else { + insert_workspace(src, ws_mem); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem); + + return exit_primitive(execute_primitive<::dnnl::lrn_forward>( + primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}})); +} + +inline +sycl::event +engine_ext::async_lrn_backward(lrn_desc &desc, float alpha, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &diff_dst_desc, void *diff_dst, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src, + ::dnnl::memory *workspace) { + + if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) { + return sycl::event(); + } + enter_primitive(2 * diff_src_desc.get_size()); + auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_backward>( + ::dnnl::algorithm::lrn_across_channels, diff_src_desc.get_desc(), + diff_dst_desc.get_desc(), src_desc.get_desc(), desc.get_local_size(), + desc.get_alpha(), desc.get_beta(), desc.get_k(), + create_primitive_desc<::dnnl::lrn_forward>( + ::dnnl::prop_kind::forward_training, + ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(), + dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(), + desc.get_beta(), desc.get_k())); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, + diff_dst_desc.get_desc(), diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, + diff_src_desc.get_desc(), diff_src); + + if (workspace) { + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace); + } else { + insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, + get_workspace(src)); + } + + return exit_primitive(execute_primitive<::dnnl::lrn_backward>( + primitive_args, + {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}})); +} + +inline +size_t engine_ext::get_batch_normalization_workspace_size( + batch_normalization_ops ops, const memory_desc_ext &src_desc) { + if(ops == batch_normalization_ops::none) { + return 0; + } + return src_desc.get_size(); +} + +inline +sycl::event engine_ext::async_batch_normalization_forward_inference( + batch_normalization_mode mode, float epsilon, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *mean, void *var) { + + return batch_normalization_forward_internal( + true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst, + scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc, mean, + var, nullptr, nullptr); +} + +inline +sycl::event engine_ext::async_batch_normalization_forward_inference( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *mean, void *var) { + + bool has_post_op = (ops != batch_normalization_ops::none); + sycl::event e; + enter_primitive(src_desc.get_size() + dst_desc.get_size() * 4 + + scale_bias_desc.get_size() * 2 + + mean_var_desc.get_size() * 5); + if (has_post_op) { + void *dst_cache = allocate(dst_desc); + batch_normalization_forward_internal( + true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, dst_desc, dst_cache, + scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr, + nullptr); + + if (ops == batch_normalization_ops::add_activation) { + async_sum(1.f, summand_desc, summand, 1.f, dst_desc, dst_cache); + } + async_activation_forward(adesc, 1.f, dst_desc, dst_cache, 0.f, dst_desc, + dst_cache); + return exit_primitive( + async_sum(alpha, dst_desc, dst_cache, beta, dst_desc, dst)); + } + return exit_primitive(batch_normalization_forward_internal( + true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst, + scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr, + nullptr)); +} + +inline +sycl::event engine_ext::async_batch_normalization_forward_training( + batch_normalization_mode mode, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *running_mean, void *running_var, void *saved_mean, void *saved_var) { + return batch_normalization_forward_internal( + false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst, + scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc, + saved_mean, saved_var, running_mean, running_var); +} + +inline +sycl::event engine_ext::async_batch_normalization_forward_training( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_desc, void *scale, void *bias, + const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var, + void *saved_mean, void *saved_var, size_t workspace_size, + void *workspace) { + enter_primitive(src_desc.get_size() + dst_desc.get_size() * 3 + + mean_var_desc.get_size() * 5 + + scale_bias_desc.get_size() * 2); + bool has_post_op = (ops != batch_normalization_ops::none); + sycl::event e; + if (has_post_op) { + if(workspace_size < dst_desc.get_desc().get_size()) { + throw std::runtime_error("async_batch_normalization_forward_training_ex: " + "no sufficient workspace."); + } + batch_normalization_forward_internal( + false, mode, epsilon, factor, 1.f, src_desc, src, 0.f, dst_desc, + workspace, scale_bias_desc, scale, bias, mean_var_desc, + saved_mean, saved_var, running_mean, running_var); + if (ops == batch_normalization_ops::add_activation) { + async_sum(1.f, summand_desc, summand, 1.f, dst_desc, + workspace); + } + return exit_primitive(async_activation_forward( + adesc, alpha, dst_desc, workspace, beta, dst_desc, dst)); + } + return exit_primitive(batch_normalization_forward_internal( + false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst, + scale_bias_desc, scale, bias, mean_var_desc, saved_mean, saved_var, + running_mean, running_var)); +} + +inline +sycl::event engine_ext::async_batch_normalization_forward_training( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float factor, float alpha, + const memory_desc_ext &src_desc, void *src, float beta, + const memory_desc_ext &dst_desc, void *dst, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias, + void *running_mean, void *running_var, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace) { + return async_batch_normalization_forward_training( + mode, ops, adesc, epsilon, factor, alpha, src_desc, src, beta, dst_desc, + dst, summand_desc, summand, scale_bias_mean_var_desc, scale, bias, + scale_bias_mean_var_desc, running_mean, running_var, saved_mean, + saved_var, workspace_size, workspace); +} + +inline +sycl::event engine_ext::async_batch_normalization_backward( + batch_normalization_mode mode, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data, + const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param, + const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale, + float beta_param, void *diff_scale, void *diff_bias, void *saved_mean, + void *saved_var) { + + return batch_normalization_backward_internal( + mode, epsilon, alpha_data, src_desc, src, diff_dst_desc, diff_dst, + beta_data, diff_src_desc, diff_src, alpha_param, + diff_scale_bias_mean_var_desc, scale, nullptr, beta_param, diff_scale, + diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var); +} + +inline +sycl::event engine_ext::async_batch_normalization_backward( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst, + float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src, + const memory_desc_ext &diff_summand_desc, void *diff_summand, + float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale, + void *bias, float beta_param, void *diff_scale, void *diff_bias, + const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace) { + std::vector caches; + ::dnnl::memory::desc real_diff_dst_desc = diff_dst_desc.get_desc(); + void *real_diff_dst = diff_dst; + + if (ops != batch_normalization_ops::none && + workspace_size < dst_desc.get_desc().get_size()) { + throw std::runtime_error("async_batch_normalization_backward_ex: " + "no sufficient workspace."); + } + enter_primitive(diff_scale_bias_desc.get_size() * 8 + + src_desc.get_size() * 3 + diff_dst_desc.get_size() * 5 + + diff_src_desc.get_size() + mean_var_desc.get_size() * 9 + + diff_summand_desc.get_size()); + if (ops == batch_normalization_ops::add_activation) { + void *diff_summand_cache = allocate(diff_summand_desc); + async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc, diff_dst, + dst_desc, workspace, 0.f, + diff_summand_desc, diff_summand_cache); + async_sum(alpha_data, diff_summand_desc, diff_summand_cache, beta_data, + diff_summand_desc, diff_summand); + real_diff_dst_desc = diff_summand_desc.get_desc(); + real_diff_dst = diff_summand_cache; + } else if (ops == batch_normalization_ops::activation) { + void *diff_dst_cache = allocate(diff_dst_desc); + async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc, + diff_dst, dst_desc, workspace, + 0.f, diff_dst_desc, diff_dst_cache); + real_diff_dst = diff_dst_cache; + } + + return exit_primitive(batch_normalization_backward_internal( + mode, epsilon, alpha_data, src_desc, src, real_diff_dst_desc, + real_diff_dst, beta_data, diff_src_desc, diff_src, alpha_param, + diff_scale_bias_desc, scale, bias, beta_param, diff_scale, diff_bias, + mean_var_desc, saved_mean, saved_var)); +} + +inline +sycl::event engine_ext::async_batch_normalization_backward( + batch_normalization_mode mode, batch_normalization_ops ops, + activation_desc &adesc, float epsilon, float alpha_data, + const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst, + float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src, + const memory_desc_ext &diff_summand_desc, void *diff_summand, + float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc, + void *scale, void *bias, float beta_param, void *diff_scale, + void *diff_bias, void *saved_mean, void *saved_var, + size_t workspace_size, void *workspace) { + + return async_batch_normalization_backward( + mode, ops, adesc, epsilon, alpha_data, src_desc, src, dst_desc, dst, + diff_dst_desc, diff_dst, beta_data, diff_src_desc, diff_src, + diff_summand_desc, diff_summand, alpha_param, + diff_scale_bias_mean_var_desc, scale, bias, beta_param, diff_scale, + diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var, + workspace_size, workspace); +} + +inline +sycl::event +engine_ext::async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg, + float alpha, const memory_desc_ext &src_desc, + void *src, const memory_desc_ext &weight_desc, + void *weight, float beta, + const memory_desc_ext &dst_desc, void *dst) { + if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) { + return sycl::event(); + } + auto help_weight_desc = + get_group_weight_desc(desc.get_group_count(), weight_desc); + + ::dnnl::primitive_attr attr; + attr.set_fpmath_mode(desc.get_math_mode()); + + auto origin_src_md = src_desc.get_desc(); + auto origin_dst_md = dst_desc.get_desc(); + auto origin_weight_md = help_weight_desc; + auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md); + auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md); + auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md); + + auto primitive_args = + create_primitive_args_or_get<::dnnl::convolution_forward>( + ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, dst_md, + desc.get_stride(), desc.get_dilate(), desc.get_padding(), + desc.get_padding(), attr); + + auto pd = get_primitive_desc<::dnnl::convolution_forward>( + primitive_args.second.primitive); + auto optimal_src_md = pd.src_desc(); + auto optimal_dst_md = pd.dst_desc(); + auto optimal_weight_md = pd.weights_desc(); + + enter_primitive( + optimal_src_md.get_size() * 3 + optimal_dst_md.get_size() * 5 + + optimal_weight_md.get_size() * 3 + origin_dst_md.get_size() * 2); + + void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight; + allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md, + optimal_src); + allocate_and_reorder_memory_to_optimal(origin_weight_md, weight, + optimal_weight_md, optimal_weight); + + if (beta == 0.f) { + if(origin_dst_md != optimal_dst_md) { + optimal_dst = allocate(optimal_dst_md); + } + } else { + allocate_and_reorder_memory_to_optimal(origin_dst_md, dst, optimal_dst_md, + optimal_dst); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md, + optimal_src); + insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md, + optimal_weight); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md, + optimal_dst); + + auto e = execute_primitive<::dnnl::convolution_forward>( + primitive_args, + {{alpha, beta, DNNL_ARG_DST, optimal_dst_md, optimal_dst}}); + + if (origin_dst_md != optimal_dst_md) { + e = async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md, + dst); + } + return exit_primitive(e); +} + +inline +sycl::event engine_ext::async_convolution_forward( + convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc, + float alpha_0, const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &weight_desc, void *weight, float alpha_1, + const memory_desc_ext &summand_desc, void *summand, + const memory_desc_ext &bias_desc, void *bias, + const memory_desc_ext &dst_desc, void *dst) { + + int channel_num = bias_desc.get_element_num(); + auto help_weight_desc = + get_group_weight_desc(desc.get_group_count(), weight_desc); + ::dnnl::memory::desc help_bias_desc = {{channel_num}, + bias_desc.get_desc().get_data_type(), + ::dnnl::memory::format_tag::a}; + auto origin_weight_md = help_weight_desc; + auto origin_bias_md = help_bias_desc; + auto origin_src_md = src_desc.get_desc(); + auto origin_dst_md = dst_desc.get_desc(); + auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md); + auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md); + auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md); + auto bias_md = transfer_memory_desc_to_format_tag_any(origin_bias_md); + + ::dnnl::primitive_attr attr; + attr.set_fpmath_mode(desc.get_math_mode()); + + auto primitive_args = + create_primitive_args_or_get<::dnnl::convolution_forward>( + ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, bias_md, + dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(), + desc.get_padding(), attr); + + auto pd = get_primitive_desc<::dnnl::convolution_forward>( + primitive_args.second.primitive); + auto optimal_src_md = pd.src_desc(); + auto optimal_dst_md = pd.dst_desc(); + auto optimal_weight_md = pd.weights_desc(); + auto optimal_bias_md = pd.bias_desc(); + + enter_primitive(optimal_src_md.get_size() + 3 * optimal_weight_md.get_size() + + optimal_bias_md.get_size() + 7 * optimal_dst_md.get_size() + + summand_desc.get_size()); + + void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight, + *optimal_bias = bias; + allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md, + optimal_src); + allocate_and_reorder_memory_to_optimal(origin_weight_md, weight, + optimal_weight_md, optimal_weight); + allocate_and_reorder_memory_to_optimal(origin_bias_md, bias, optimal_bias_md, + optimal_bias); + if (origin_dst_md != optimal_dst_md) { + optimal_dst = allocate(optimal_dst_md); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md, + optimal_src); + insert_arg(primitive_args.second.args, DNNL_ARG_BIAS, optimal_bias_md, + optimal_bias); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md, + optimal_dst); + + void *cache = nullptr; + if (alpha_0 != 1.f) { + cache = allocate(optimal_weight_md); + _q->memcpy(cache, optimal_weight, optimal_weight_md.get_size()); + async_scale(alpha_0, optimal_weight_md, cache); + insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md, + cache); + execute_primitive<::dnnl::convolution_forward>( + primitive_args, + {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}}); + } else { + insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md, + optimal_weight); + execute_primitive<::dnnl::convolution_forward>( + primitive_args, + {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}}); + } + if (origin_dst_md != optimal_dst_md) { + async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md, dst); + } + async_sum(alpha_1, summand_desc, summand, 1.f, dst_desc, dst); + return exit_primitive( + async_activation_forward(adesc, 1.f, dst_desc, dst, 0.f, dst_desc, dst)); +} + +inline +sycl::event engine_ext::async_convolution_backward_data( + convolution_desc &desc, ::dnnl::algorithm alg, float alpha, + const memory_desc_ext &weight_desc, void *weight, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta, + const memory_desc_ext &diff_src_desc, void *diff_src) { + + if (scale_parameter_preprocess({{alpha, beta, diff_dst_desc, diff_dst}})) { + return sycl::event(); + } + + auto help_weight_desc = + get_group_weight_desc(desc.get_group_count(), weight_desc); + + auto origin_weight_md = help_weight_desc; + auto origin_diff_src_md = diff_src_desc.get_desc(); + auto origin_diff_dst_md = diff_dst_desc.get_desc(); + auto diff_src_md = transfer_memory_desc_to_format_tag_any(origin_diff_src_md); + auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md); + auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md); + + ::dnnl::primitive_attr attr; + attr.set_fpmath_mode(desc.get_math_mode()); + + auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>( + ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto, + diff_src_md, weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(), + desc.get_padding(), desc.get_padding(), attr); + + auto primitive_args = + create_primitive_args_or_get<::dnnl::convolution_backward_data>( + ::dnnl::algorithm::convolution_auto, diff_src_md, weight_md, + diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(), + desc.get_padding(), forward_primitive, attr); + + auto pd = get_primitive_desc<::dnnl::convolution_backward_data>( + primitive_args.second.primitive); + auto optimal_diff_src_md = pd.diff_src_desc(); + auto optimal_diff_dst_md = pd.diff_dst_desc(); + auto optimal_weight_md = pd.weights_desc(); + + enter_primitive(5 * optimal_diff_src_md.get_size() + + optimal_diff_dst_md.get_size() + + optimal_weight_md.get_size()); + + void *optimal_diff_src = diff_src, *optimal_diff_dst = diff_dst, + *optimal_weight = weight; + allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst, + optimal_diff_dst_md, optimal_diff_dst); + allocate_and_reorder_memory_to_optimal(origin_weight_md, weight, + optimal_weight_md, optimal_weight); + if (beta == 0.f) { + if (origin_diff_src_md != optimal_diff_src_md) { + optimal_diff_src = allocate(optimal_diff_src_md); + } + } else { + allocate_and_reorder_memory_to_optimal( + origin_diff_src_md, diff_src, optimal_diff_src_md, optimal_diff_src); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md, + optimal_diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md, + optimal_weight); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, optimal_diff_src_md, + optimal_diff_src); + + auto e = execute_primitive<::dnnl::convolution_backward_data>( + primitive_args, + {{alpha, beta, DNNL_ARG_DIFF_SRC, optimal_diff_src_md, optimal_diff_src}}); + + if (origin_diff_src_md != optimal_diff_src_md) { + e = async_reorder(1.f, optimal_diff_src_md, optimal_diff_src, 0.f, + origin_diff_src_md, diff_src); + } + return exit_primitive(e); +} + +inline +sycl::event engine_ext::async_convolution_backward_weight( + convolution_desc &desc, ::dnnl::algorithm alg, float alpha, + const memory_desc_ext &src_desc, void *src, + const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta, + const memory_desc_ext &diff_weight_desc, void *diff_weight) { + + if (scale_parameter_preprocess( + {{alpha, beta, diff_weight_desc, diff_weight}})) { + return sycl::event(); + } + + auto help_diff_weight_desc = + get_group_weight_desc(desc.get_group_count(), diff_weight_desc); + + ::dnnl::primitive_attr attr; + attr.set_fpmath_mode(desc.get_math_mode()); + + auto origin_diff_weight_md = help_diff_weight_desc; + auto origin_src_md = src_desc.get_desc(); + auto origin_diff_dst_md = diff_dst_desc.get_desc(); + auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md); + auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md); + auto diff_weight_md = + transfer_memory_desc_to_format_tag_any(origin_diff_weight_md); + + auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>( + ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto, + src_md, diff_weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(), + desc.get_padding(), desc.get_padding(), attr); + + auto primitive_args = + create_primitive_args_or_get<::dnnl::convolution_backward_weights>( + ::dnnl::algorithm::convolution_auto, src_md, diff_weight_md, + diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(), + desc.get_padding(), forward_primitive, attr); + + auto pd = get_primitive_desc<::dnnl::convolution_backward_weights>( + primitive_args.second.primitive); + auto optimal_src_md = pd.src_desc(); + auto optimal_diff_dst_md = pd.diff_dst_desc(); + auto optimal_diff_weight_md = pd.diff_weights_desc(); + + enter_primitive(optimal_diff_weight_md.get_size() * 5 + + optimal_diff_dst_md.get_size() + optimal_src_md.get_size()); + + void *optimal_src = src, *optimal_diff_dst = diff_dst, + *optimal_diff_weight = diff_weight; + allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst, + optimal_diff_dst_md, optimal_diff_dst); + allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md, + optimal_src); + if (beta == 0.f) { + if (origin_diff_weight_md != optimal_diff_weight_md) { + optimal_diff_weight = allocate(optimal_diff_weight_md); + } + } else { + allocate_and_reorder_memory_to_optimal(origin_diff_weight_md, diff_weight, + optimal_diff_weight_md, + optimal_diff_weight); + } + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md, + optimal_src); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md, + optimal_diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_WEIGHTS, + optimal_diff_weight_md, optimal_diff_weight); + + auto e = execute_primitive<::dnnl::convolution_backward_weights>( + primitive_args, {{alpha, beta, DNNL_ARG_DIFF_WEIGHTS, + optimal_diff_weight_md, optimal_diff_weight}}); + + if (origin_diff_weight_md != optimal_diff_weight_md) { + e = async_reorder(1.f, optimal_diff_weight_md, optimal_diff_weight, 0.f, + origin_diff_weight_md, diff_weight); + } + return exit_primitive(e); +} + +inline +sycl::event engine_ext::async_convolution_backward_bias( + float alpha, const memory_desc_ext &diff_dst_desc, void *diff_dst, + float beta, const memory_desc_ext &diff_bias_desc, void *diff_bias) { + return async_reduction(reduction_op::sum, alpha, diff_dst_desc, diff_dst, beta, + diff_bias_desc, diff_bias); +} + +inline +void engine_ext::rnn_get_weight_space_size(const rnn_desc &desc, + size_t *weight_space_size) { + *weight_space_size = 0; + rnn_forward_internal(desc, ::dnnl::prop_kind::forward_inference, + memory_desc_ext(), nullptr, memory_desc_ext(), nullptr, + memory_desc_ext(), nullptr, nullptr, memory_desc_ext(), + nullptr, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, true, + weight_space_size, nullptr, nullptr); + return; +} + +inline +void engine_ext::rnn_get_scratchpad_workspace_size( + const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, size_t *scratchpad_size, + size_t *workspace_size) { + *workspace_size = 0; + *scratchpad_size = 0; + rnn_forward_internal(desc, kind, src_desc, nullptr, memory_desc_ext(), + nullptr, memory_desc_ext(), nullptr, nullptr, + memory_desc_ext(), nullptr, nullptr, 0, nullptr, 0, + nullptr, 0, nullptr, true, nullptr, workspace_size, + scratchpad_size); + return; +} + +inline +sycl::event engine_ext::async_rnn_forward( + const rnn_desc &desc, ::dnnl::prop_kind kind, + const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc, + void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter, + const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c, + size_t weight_size, void *weight, size_t scratchpad_size, void *scratchpad, + size_t workspace_size, void *workspace) { + + return rnn_forward_internal( + desc, kind, src_desc, src, dst_desc, dst, iter_desc, src_iter, dst_iter, + iter_c_desc, src_iter_c, dst_iter_c, weight_size, weight, workspace_size, + workspace, scratchpad_size, scratchpad, false, nullptr, nullptr, + nullptr); +} + +inline +sycl::event engine_ext::async_rnn_backward( + const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst, + void *diff_dst, const memory_desc_ext &src_desc, void *src, void *diff_src, + const memory_desc_ext &iter_desc, void *src_iter, void *diff_dst_iter, + void *diff_src_iter, const memory_desc_ext &iter_c_desc, void *src_iter_c, + void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size, + void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad, + size_t workspace_size, void *workspace) { + ::dnnl::memory::data_type src_dt; + ::dnnl::memory::format_tag src_format_tag; + rnn_mode mode; + rnn_memory_format_tag format_tag; + rnn_bias_mode bias_mode; + rnn_direction direction; + dpct::library_data_t dt; + int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0, + layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0, + seq_length = 1, batch_size = 1; + void *last_layer_cache = nullptr; + void *hidden_layer_cache = nullptr; + sycl::event e; + enter_primitive(src_desc.get_size() * 2); + std::vector offset(9, 0); + std::vector data = { + src, + dst, + (uint8_t *)src_iter + iter_desc.get_size(), + nullptr, + (uint8_t *)src_iter_c + iter_c_desc.get_size(), + nullptr, + (uint8_t *)weight + weight_size, + (uint8_t *)workspace + workspace_size, + diff_src, + diff_dst, + (uint8_t *)diff_src_iter + iter_desc.get_size(), + (uint8_t *)diff_dst_iter + iter_desc.get_size(), + (uint8_t *)diff_src_iter_c + iter_c_desc.get_size(), + (uint8_t *)diff_dst_iter_c + iter_c_desc.get_size(), + (uint8_t *)diff_weight + weight_size, + scratchpad}; + + desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size, + &projection_size, &layer_size); + + get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size, + &src_dt, &src_format_tag, &projection_size, + &output_size, &seq_length, &batch_size, &direction_num, + &gate_num); + + if (direction == rnn_direction::bidirectional) { + if (layer_size > 1) { + last_layer_cache = allocate(src_desc); + hidden_layer_cache = allocate(src_desc); + data[8] = last_layer_cache; + } + e = execute_rnn_backward_primitive( + mode, ::dnnl::rnn_direction::bidirectional_concat, bias_mode, src_dt, + src_format_tag, seq_length, batch_size, output_size, 2 * output_size, 1, + direction_num, hidden_size, gate_num, projection_size, data, offset, 1); + if (layer_size > 1) { + data[8] = hidden_layer_cache; + data[9] = last_layer_cache; + e = execute_rnn_backward_primitive( + mode, ::dnnl::rnn_direction::bidirectional_sum, bias_mode, src_dt, + src_format_tag, seq_length, batch_size, output_size, output_size, 1, + direction_num, hidden_size, gate_num, projection_size, data, offset, + layer_size - 1); + _q->memcpy(diff_src, + ((layer_size - 1) % 2 == 0) ? last_layer_cache + : hidden_layer_cache, + src_desc.get_size()); + } + } else { + e = execute_rnn_backward_primitive( + mode, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode, + src_dt, src_format_tag, seq_length, batch_size, output_size, + output_size, layer_size, direction_num, hidden_size, gate_num, + projection_size, data, offset, 1); + } + + return exit_primitive(e); +} + +inline +size_t engine_ext::get_dropout_state_size(){ +#ifndef __INTEL_MKL__ + throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) " + "Interfaces Project does not support this API."); +#else + auto r = get_internal_resource(_q); + if(r->random_engine_state_size == -1){ + auto rand_engine = rng_engine_t(*_q, 0); + r->random_engine_state_size = + oneapi::mkl::rng::get_state_size(rand_engine); + } + return r->random_engine_state_size; +#endif +} + +inline size_t +engine_ext::get_dropout_workspace_size(const memory_desc_ext &src_desc) { + return src_desc.get_size(); +} + +inline +sycl::event engine_ext::async_dropout_forward(dropout_desc &desc, + const memory_desc_ext &src_desc, + void *src, + const memory_desc_ext &dst_desc, + void *dst, void *workspace, + size_t workspace_size) { + if (workspace_size < src_desc.get_size()) { + throw std::runtime_error("async_dropout_forward: no sufficient workspace."); + } + enter_primitive(src_desc.get_size() * 2 + dst_desc.get_size() * 2); + float p = desc.get_probability(); + if (p == 1.f) { + return _q->memset(dst, 0, dst_desc.get_size()); + } else if (p == 0.f) { + return async_reorder(1.f, src_desc, src, 0.f, dst_desc, dst); + } + + float scale_factor = 1.f / (1.f - p); + void *cache = workspace; + + memory_desc_ext rng_data_desc( + ::dnnl::memory::desc(src_desc.get_dims(), ::dnnl::memory::data_type::s32, + src_desc.get_strides())); + if (src_desc.get_desc().get_data_type() != ::dnnl::memory::data_type::s32) { + cache = allocate(rng_data_desc); + } + + desc.generate(_q, get_dropout_state_size(), rng_data_desc.get_element_num(), + (std::int32_t *)cache); + + if (cache == workspace) { + async_scale(scale_factor, src_desc, workspace); + } else { + async_reorder(scale_factor, rng_data_desc, cache, 0.f, src_desc, workspace); + } + + auto primitive_args = create_primitive_args_or_get<::dnnl::binary>( + ::dnnl::algorithm::binary_mul, src_desc.get_desc(), src_desc.get_desc(), + dst_desc.get_desc()); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc.get_desc(), + src); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc.get_desc(), + workspace); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(), + dst); + + return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args)); +} + +inline +sycl::event engine_ext::async_dropout_backward( + dropout_desc &desc, const memory_desc_ext &diff_dst_desc, + void *diff_dst, const memory_desc_ext &diff_src_desc, void *diff_src, + void *workspace, size_t workspace_size) { + enter_primitive(2 * diff_src_desc.get_size()); + float p = desc.get_probability(); + if (p == 1.f) { + return _q->memset(diff_src, 0, diff_src_desc.get_size()); + } else if (p == 0.f) { + return async_reorder(1.f, diff_dst_desc, diff_dst, 0.f, diff_src_desc, + diff_src); + } + + auto primitive_args = create_primitive_args_or_get<::dnnl::binary>( + ::dnnl::algorithm::binary_mul, diff_dst_desc.get_desc(), + diff_dst_desc.get_desc(), diff_src_desc.get_desc()); + + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, + diff_dst_desc.get_desc(), diff_dst); + insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, + diff_dst_desc.get_desc(), workspace); + insert_arg(primitive_args.second.args, DNNL_ARG_DST, diff_src_desc.get_desc(), + diff_src); + + return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args)); +} +} // namespace dnnl +} // namespace dpct + +#endif // __DPCT_DNNL_UTILS_HPP__ diff --git a/dpct/dpct.hpp b/dpct/dpct.hpp new file mode 100644 index 0000000000000..8cc312f0ea31d --- /dev/null +++ b/dpct/dpct.hpp @@ -0,0 +1,62 @@ +//==---- dpct.hpp ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_HPP__ +#define __DPCT_HPP__ + +#include +#include +#include +#include + +template class dpct_kernel_name; +template class dpct_kernel_scalar; + +#include "atomic.hpp" +#include "device.hpp" +#include "image.hpp" +#include "kernel.hpp" +#include "math.hpp" +#include "memory.hpp" +#include "util.hpp" + +#if defined(_MSC_VER) +#define __dpct_align__(n) __declspec(align(n)) +#define __dpct_inline__ __forceinline +#else +#define __dpct_align__(n) __attribute__((aligned(n))) +#define __dpct_inline__ __inline__ __attribute__((always_inline)) +#endif + +#if defined(_MSC_VER) +#define __dpct_noinline__ __declspec(noinline) +#else +#define __dpct_noinline__ __attribute__((noinline)) +#endif + +#define DPCT_COMPATIBILITY_TEMP (900) + +namespace dpct{ +enum error_code { success = 0, default_error = 999 }; +} + +#define DPCT_CHECK_ERROR(expr) \ + [&]() { \ + try { \ + expr; \ + return dpct::success; \ + } catch (std::exception const &e) { \ + std::cerr << e.what() << std::endl; \ + return dpct::default_error; \ + } \ + }() + +#define DPCT_PI_F (3.14159274101257f) +#define DPCT_PI (3.141592653589793115998) + +#endif // __DPCT_HPP__ diff --git a/dpct/dpl_extras/algorithm.h b/dpct/dpl_extras/algorithm.h new file mode 100644 index 0000000000000..7c98b7a2282f9 --- /dev/null +++ b/dpct/dpl_extras/algorithm.h @@ -0,0 +1,2419 @@ +//==---- algorithm.h ------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_ALGORITHM_H__ +#define __DPCT_ALGORITHM_H__ + +#include +#include +#include + +#include "functional.h" +#include "iterators.h" +#include "vector.h" + +namespace dpct { + +template +void replace_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p, + const T &new_value) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + std::transform( + std::forward(policy), first, last, mask, first, + internal::replace_if_fun::value_type, + Pred>(p, new_value)); +} + +template +Iter3 replace_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 result, Pred p, const T &new_value) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + return std::transform( + std::forward(policy), first, last, mask, result, + internal::replace_if_fun::value_type, + Pred>(p, new_value)); +} + +template +internal::enable_if_hetero_execution_policy +remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using oneapi::dpl::make_zip_iterator; + using policy_type = typename std::decay::type; + using internal::__buffer; + using ValueType = typename std::iterator_traits::value_type; + + __buffer _tmp(std::distance(first, last)); + + auto end = std::copy_if( + policy, make_zip_iterator(first, mask), + make_zip_iterator(last, mask + std::distance(first, last)), + make_zip_iterator(_tmp.get(), oneapi::dpl::discard_iterator()), + internal::negate_predicate_key_fun(p)); + return std::copy(std::forward(policy), _tmp.get(), + std::get<0>(end.base()), first); +} + +template +typename std::enable_if::type>::value, + Iter1>::type +remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using oneapi::dpl::make_zip_iterator; + using policy_type = typename std::decay::type; + using ValueType = typename std::iterator_traits::value_type; + + std::vector _tmp(std::distance(first, last)); + + auto end = std::copy_if( + policy, make_zip_iterator(first, mask), + make_zip_iterator(last, mask + std::distance(first, last)), + make_zip_iterator(_tmp.begin(), oneapi::dpl::discard_iterator()), + internal::negate_predicate_key_fun(p)); + return std::copy(policy, _tmp.begin(), std::get<0>(end.base()), first); +} + +template +Iter3 remove_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 result, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using oneapi::dpl::make_zip_iterator; + auto ret_val = std::remove_copy_if( + std::forward(policy), make_zip_iterator(first, mask), + make_zip_iterator(last, mask + std::distance(first, last)), + make_zip_iterator(result, oneapi::dpl::discard_iterator()), + internal::predicate_key_fun(p)); + return std::get<0>(ret_val.base()); +} + +template +std::pair unique(Policy &&policy, Iter1 keys_first, + Iter1 keys_last, Iter2 values_first, + BinaryPred binary_pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::unique( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first, values_first), + oneapi::dpl::make_zip_iterator( + keys_last, values_first + std::distance(keys_first, keys_last)), + internal::compare_key_fun(binary_pred)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_first, values_first), ret_val); + return std::make_pair(keys_first + n1, values_first + n1); +} + +template +std::pair unique(Policy &&policy, Iter1 keys_first, + Iter1 keys_last, Iter2 values_first) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using T = typename std::iterator_traits::value_type; + return unique(std::forward(policy), keys_first, keys_last, + values_first, std::equal_to()); +} + +template +std::pair unique_copy(Policy &&policy, Iter1 keys_first, + Iter1 keys_last, Iter2 values_first, + Iter3 keys_result, Iter4 values_result, + BinaryPred binary_pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::unique_copy( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first, values_first), + oneapi::dpl::make_zip_iterator( + keys_last, values_first + std::distance(keys_first, keys_last)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::unique_fun(binary_pred)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair unique_copy(Policy &&policy, Iter1 keys_first, + Iter1 keys_last, Iter2 values_first, + Iter3 keys_result, Iter4 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using T = typename std::iterator_traits::value_type; + auto comp = std::equal_to(); + return unique_copy(std::forward(policy), keys_first, keys_last, + values_first, keys_result, values_result, comp); +} + +template +Iter partition_point(Policy &&policy, Iter first, Iter last, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + if (std::is_partitioned(policy, first, last, p)) + return std::find_if_not(std::forward(policy), first, last, p); + else + return first; +} + +template +Iter3 copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 result, Pred pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::copy_if( + std::forward(policy), oneapi::dpl::make_zip_iterator(first, mask), + oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)), + oneapi::dpl::make_zip_iterator(result, oneapi::dpl::discard_iterator()), + internal::predicate_key_fun(pred)); + return std::get<0>(ret_val.base()); +} + +template +Iter2 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 result, + UnaryOperation unary_op, Pred pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using T = typename std::iterator_traits::value_type; + const auto n = std::distance(first, last); + std::for_each( + std::forward(policy), + oneapi::dpl::make_zip_iterator(first, result), + oneapi::dpl::make_zip_iterator(first, result) + n, + internal::transform_if_fun(pred, unary_op)); + return result + n; +} + +template +Iter3 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 result, UnaryOperation unary_op, Pred pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using T = typename std::iterator_traits::value_type; + using Ref1 = typename std::iterator_traits::reference; + using Ref2 = typename std::iterator_traits::reference; + const auto n = std::distance(first, last); + std::for_each( + std::forward(policy), + oneapi::dpl::make_zip_iterator(first, mask, result), + oneapi::dpl::make_zip_iterator(first, mask, result) + n, + internal::transform_if_unary_zip_mask_fun( + pred, unary_op)); + return result + n; +} + +template +Iter4 transform_if(Policy &&policy, Iter1 first1, Iter1 last1, Iter2 first2, + Iter3 mask, Iter4 result, BinaryOperation binary_op, + Pred pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + const auto n = std::distance(first1, last1); + using ZipIterator = + typename oneapi::dpl::zip_iterator; + using T = typename std::iterator_traits::value_type; + std::for_each( + std::forward(policy), + oneapi::dpl::make_zip_iterator(first1, first2, mask, result), + oneapi::dpl::make_zip_iterator(last1, first2 + n, mask + n, result + n), + internal::transform_if_zip_mask_fun(pred, + binary_op)); + return result + n; +} + +template +void scatter(Policy &&policy, InputIter1 first, InputIter1 last, InputIter2 map, + OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + oneapi::dpl::copy(policy, first, last, + oneapi::dpl::make_permutation_iterator(result, map)); +} + +template +OutputIter gather(Policy &&policy, InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(policy, perm_begin, perm_begin + n, result); +} + +template +void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last, + InputIter2 map, InputIter3 mask, OutputIter result, + Predicate pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + transform_if( + ::std::forward(policy), first, last, mask, + oneapi::dpl::make_permutation_iterator(result, map), + [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); }); +} + +template +void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last, + InputIter2 map, InputIter3 mask, OutputIter result) { + scatter_if(::std::forward(policy), first, last, map, mask, result, + internal::no_op_fun()); +} + +template +OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last, + InputIter2 mask, InputIter3 input_first, OutputIter result, + Predicate pred) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = std::distance(map_first, map_last); + + return transform_if( + ::std::forward(policy), perm_begin, perm_begin + n, mask, result, + [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); }); +} + +template +OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last, + InputIter2 mask, InputIter3 input_first, + OutputIter result) { + return gather_if(::std::forward(policy), map_first, map_last, mask, + input_first, result, internal::no_op_fun()); +} + +template +std::pair +merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2, + Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2, + Iter5 keys_result, Iter6 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto n1 = std::distance(keys_first1, keys_last1); + auto n2 = std::distance(keys_first2, keys_last2); + std::merge(std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun<>()); + return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2); +} + +template +std::pair +merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2, + Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2, + Iter5 keys_result, Iter6 values_result, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto n1 = std::distance(keys_first1, keys_last1); + auto n2 = std::distance(keys_first2, keys_last2); + std::merge(std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun(comp)); + return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2); +} + +template +void iota(Policy &&policy, Iter first, Iter last, T init, T step) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using DiffSize = typename std::iterator_traits::difference_type; + std::transform( + std::forward(policy), oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(std::distance(first, last)), + first, internal::sequence_fun(init, step)); +} + +template +void iota(Policy &&policy, Iter first, Iter last, T init) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + iota(std::forward(policy), first, last, init, T(1)); +} + +template +void iota(Policy &&policy, Iter first, Iter last) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using DiffSize = typename std::iterator_traits::difference_type; + iota(std::forward(policy), first, last, DiffSize(0), DiffSize(1)); +} + +template +void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last, + Iter2 values_first, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto first = oneapi::dpl::make_zip_iterator(keys_first, values_first); + auto last = first + std::distance(keys_first, keys_last); + std::sort(std::forward(policy), first, last, + internal::compare_key_fun(comp)); +} + +template +void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last, + Iter2 values_first) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + sort(std::forward(policy), keys_first, keys_last, values_first, + internal::__less()); +} + +template +void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last, + Iter2 values_first, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + std::stable_sort( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first, values_first), + oneapi::dpl::make_zip_iterator( + keys_last, values_first + std::distance(keys_first, keys_last)), + internal::compare_key_fun(comp)); +} + +template +void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last, + Iter2 values_first) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + stable_sort(std::forward(policy), keys_first, keys_last, values_first, + internal::__less()); +} + +template +void for_each_index(Policy &&policy, Iter first, Iter last, Operator unary_op) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + using DiffSize = typename std::iterator_traits::difference_type; + std::transform( + std::forward(policy), oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(std::distance(first, last)), + first, unary_op); +} + +template +std::pair +set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1, + Iter4 keys_result, Iter5 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_intersection( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, + oneapi::dpl::discard_iterator()), + oneapi::dpl::make_zip_iterator(keys_last2, + oneapi::dpl::discard_iterator()), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun<>()); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair +set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1, + Iter4 keys_result, Iter5 values_result, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_intersection( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, + oneapi::dpl::discard_iterator()), + oneapi::dpl::make_zip_iterator(keys_last2, + oneapi::dpl::discard_iterator()), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun(comp)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair +set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, + Iter3 values_first1, Iter4 values_first2, + Iter5 keys_result, Iter6 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_symmetric_difference( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun<>()); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair +set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, + Iter3 values_first1, Iter4 values_first2, + Iter5 keys_result, Iter6 values_result, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_symmetric_difference( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun(comp)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair +set_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1, + Iter4 values_first2, Iter5 keys_result, Iter6 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_difference( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun<>()); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +std::pair set_difference(Policy &&policy, Iter1 keys_first1, + Iter1 keys_last1, Iter2 keys_first2, + Iter2 keys_last2, Iter3 values_first1, + Iter4 values_first2, Iter5 keys_result, + Iter6 values_result, Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_difference( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun(comp)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +internal::enable_if_execution_policy> +set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1, + Iter4 values_first2, Iter5 keys_result, Iter6 values_result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_union( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun<>()); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +internal::enable_if_execution_policy> +set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, + Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1, + Iter4 values_first2, Iter5 keys_result, Iter6 values_result, + Comp comp) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::set_union( + std::forward(policy), + oneapi::dpl::make_zip_iterator(keys_first1, values_first1), + oneapi::dpl::make_zip_iterator( + keys_last1, values_first1 + std::distance(keys_first1, keys_last1)), + oneapi::dpl::make_zip_iterator(keys_first2, values_first2), + oneapi::dpl::make_zip_iterator( + keys_last2, values_first2 + std::distance(keys_first2, keys_last2)), + oneapi::dpl::make_zip_iterator(keys_result, values_result), + internal::compare_key_fun(comp)); + auto n1 = std::distance( + oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val); + return std::make_pair(keys_result + n1, values_result + n1); +} + +template +internal::enable_if_execution_policy> +stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 out_true, Iter4 out_false, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto ret_val = std::partition_copy( + std::forward(policy), oneapi::dpl::make_zip_iterator(first, mask), + oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)), + oneapi::dpl::make_zip_iterator(out_true, oneapi::dpl::discard_iterator()), + oneapi::dpl::make_zip_iterator(out_false, + oneapi::dpl::discard_iterator()), + internal::predicate_key_fun(p)); + return std::make_pair(std::get<0>(ret_val.first.base()), + std::get<0>(ret_val.second.base())); +} + +template +internal::enable_if_execution_policy> +stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter3 out_true, + Iter4 out_false, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + return std::partition_copy(std::forward(policy), first, last, + out_true, out_false, p); +} + +template +internal::enable_if_execution_policy> +partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, + Iter3 out_true, Iter4 out_false, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + return stable_partition_copy(std::forward(policy), first, last, mask, + out_true, out_false, p); +} + +template +internal::enable_if_hetero_execution_policy +stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + typedef typename std::decay::type policy_type; + internal::__buffer::value_type> _tmp( + std::distance(first, last)); + + std::copy(policy, mask, mask + std::distance(first, last), _tmp.get()); + + auto ret_val = + std::stable_partition(std::forward(policy), + oneapi::dpl::make_zip_iterator(first, _tmp.get()), + oneapi::dpl::make_zip_iterator( + last, _tmp.get() + std::distance(first, last)), + internal::predicate_key_fun(p)); + return std::get<0>(ret_val.base()); +} + +template +typename std::enable_if::type>::value, + Iter1>::type +stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + typedef typename std::decay::type policy_type; + std::vector::value_type> _tmp( + std::distance(first, last)); + + std::copy(policy, mask, mask + std::distance(first, last), _tmp.begin()); + + auto ret_val = std::stable_partition( + std::forward(policy), + oneapi::dpl::make_zip_iterator(first, _tmp.begin()), + oneapi::dpl::make_zip_iterator(last, + _tmp.begin() + std::distance(first, last)), + internal::predicate_key_fun(p)); + return std::get<0>(ret_val.base()); +} + +template +internal::enable_if_execution_policy +partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + return stable_partition(std::forward(policy), first, last, mask, p); +} + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value> +sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, bool descending = false, + int begin_bit = 0, + int end_bit = + sizeof(typename ::std::iterator_traits::value_type) * 8); + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value> +sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n, + bool descending = false, int begin_bit = 0, + int end_bit = + sizeof(typename ::std::iterator_traits::value_type) * 8); + +namespace internal { + +// Transforms key to a specific bit range and sorts the transformed key +template +inline void transform_and_sort(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + ::std::int64_t n, bool descending, int begin_bit, + int end_bit) { + using key_t_value_t = typename std::iterator_traits::value_type; + auto trans_key = + translate_key(begin_bit, end_bit); + + // Use of the comparison operator that is not simply std::greater() or + // std::less() will result in + // not using radix sort which will cost some performance. However, this is + // necessary to provide the transformation of the key to the bitrange + // desired. + auto partial_sort_with_comp = [&](const auto &comp) { + return oneapi::dpl::partial_sort_copy( + std::forward(policy), keys_in, keys_in + n, keys_out, + keys_out + n, [=](const auto a, const auto b) { + return comp(trans_key(a), trans_key(b)); + }); + }; + if (descending) + partial_sort_with_comp(::std::greater()); + else + partial_sort_with_comp(::std::less()); +} + +template +inline void sort_only(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + ::std::int64_t n, bool descending) { + using key_t_value_t = typename ::std::iterator_traits::value_type; + + if constexpr (::std::is_floating_point::value) { + if (descending) { + // Comparison operator that is not std::greater() ensures stability of + // -0.0 and 0.0 + // at the cost of some performance because radix sort will not be used. + auto comp_descending = [=](const auto a, const auto b) { return a > b; }; + + oneapi::dpl::partial_sort_copy(::std::forward(policy), keys_in, + keys_in + n, keys_out, keys_out + n, + comp_descending); + } else { + // Comparison operator that is not std::less() ensures stability of -0.0 + // and 0.0 + // at the cost of some performance because radix sort will not be used. + auto comp_ascending = [=](const auto a, const auto b) { return a < b; }; + + oneapi::dpl::partial_sort_copy(::std::forward(policy), keys_in, + keys_in + n, keys_out, keys_out + n, + comp_ascending); + } + } else { + if (descending) { + oneapi::dpl::partial_sort_copy(::std::forward(policy), keys_in, + keys_in + n, keys_out, keys_out + n, + ::std::greater()); + } else { + + oneapi::dpl::partial_sort_copy(::std::forward(policy), keys_in, + keys_in + n, keys_out, keys_out + n); + } + } +} + +// Transforms key from a pair to a specific bit range and sorts the pairs by the +// transformed key +template +inline void +transform_and_sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + Iter3 values_in, Iter4 values_out, ::std::int64_t n, + bool descending, int begin_bit, int end_bit) { + using key_t_value_t = typename std::iterator_traits::value_type; + auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in); + auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out); + auto trans_key = + translate_key(begin_bit, end_bit); + + // Use of the comparison operator that is not simply std::greater() or + // std::less() will result in + // not using radix sort which will cost some performance. However, this is + // necessary to provide the transformation of the key to the bitrange desired + // and also to select the key from the zipped pair. + auto load_val = [=](const auto a) { return trans_key(std::get<0>(a)); }; + + auto partial_sort_with_comp = [&](const auto &comp) { + return oneapi::dpl::partial_sort_copy( + std::forward(policy), zip_input, zip_input + n, zip_output, + zip_output + n, [=](const auto a, const auto b) { + return comp(load_val(a), load_val(b)); + }); + }; + if (descending) + partial_sort_with_comp(::std::greater()); + else + partial_sort_with_comp(::std::less()); +} + +template +inline void sort_only_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + Iter3 values_in, Iter4 values_out, ::std::int64_t n, + bool descending) { + using key_t_value_t = typename ::std::iterator_traits::value_type; + auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in); + auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out); + + // Use of the comparison operator that is not simply std::greater() or + // std::less() will result in + // not using radix sort which will cost some performance. However, this is + // necessary to select the key from the zipped pair. + auto load_val = [=](const auto a) { return std::get<0>(a); }; + + auto partial_sort_with_comp = [&](const auto &comp) { + return oneapi::dpl::partial_sort_copy( + std::forward(policy), zip_input, zip_input + n, zip_output, + zip_output + n, [=](const auto a, const auto b) { + return comp(load_val(a), load_val(b)); + }); + }; + if (descending) + partial_sort_with_comp(::std::greater()); + else + partial_sort_with_comp(::std::less()); +} + +// overload for Iter2 != std::nullptr_t +template +typename ::std::enable_if::value>::type +sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, bool descending, + int begin_bit, int end_bit) { + using key_t_value_t = typename ::std::iterator_traits::value_type; + + int clipped_begin_bit = ::std::max(begin_bit, 0); + int clipped_end_bit = + ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8); + int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1; + + auto transform_and_sort_pairs_f = [&](auto x) { + using T = typename ::std::decay_t; + internal::transform_and_sort_pairs( + ::std::forward(policy), keys_in, keys_out, values_in, + values_out, n, descending, clipped_begin_bit, clipped_end_bit); + }; + + if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) { + internal::sort_only_pairs(::std::forward(policy), keys_in, keys_out, + values_in, values_out, n, descending); + } else if (num_bytes == 1) { + transform_and_sort_pairs_f.template operator()(0); + } else if (num_bytes == 2) { + transform_and_sort_pairs_f.template operator()(0); + } else if (num_bytes <= 4) { + transform_and_sort_pairs_f.template operator()(0); + } else // if (num_bytes <= 8) + { + transform_and_sort_pairs_f.template operator()<::std::uint64_t>(0); + } +} + +// overload for Iter2 == std::nullptr_t +template +typename ::std::enable_if<::std::is_null_pointer::value>::type +sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, bool descending, + int begin_bit, int end_bit) { + // create temporary keys_out to discard, memory footprint could be improved by + // a specialized iterator with a single + // unchanging dummy Iter1 element + using key_t_value_t = typename std::iterator_traits::value_type; + sycl::buffer temp_keys_out{sycl::range<1>(n)}; + internal::sort_pairs_impl(std::forward(policy), keys_in, + oneapi::dpl::begin(temp_keys_out), values_in, + values_out, n, descending, begin_bit, end_bit); +} + +template +inline void segmented_sort_pairs_by_parallel_sorts( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter4 values_in, + Iter3 values_out, ::std::int64_t n, ::std::int64_t nsegments, + Iter5 begin_offsets, Iter5 end_offsets, bool descending = false, + int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + using offset_type = typename ::std::iterator_traits::value_type; + ::std::vector host_accessible_offset_starts(nsegments); + ::std::vector host_accessible_offset_ends(nsegments); + // make offsets accessible on host + ::std::copy(policy, begin_offsets, begin_offsets + nsegments, + host_accessible_offset_starts.begin()); + ::std::copy(policy, end_offsets, end_offsets + nsegments, + host_accessible_offset_ends.begin()); + + for (::std::uint64_t i = 0; i < nsegments; i++) { + ::std::uint64_t segment_begin = host_accessible_offset_starts[i]; + ::std::uint64_t segment_end = + ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]); + if (segment_begin < segment_end) { + ::dpct::sort_pairs( + policy, keys_in + segment_begin, keys_out + segment_begin, + values_in + segment_begin, values_out + segment_begin, + segment_end - segment_begin, descending, begin_bit, end_bit); + } + } +} + +template +inline void segmented_sort_keys_by_parallel_sorts( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n, + ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets, + bool descending = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + using offset_type = typename ::std::iterator_traits::value_type; + ::std::vector host_accessible_offset_starts(nsegments); + ::std::vector host_accessible_offset_ends(nsegments); + // make offsets accessible on host + ::std::copy(policy, begin_offsets, begin_offsets + nsegments, + host_accessible_offset_starts.begin()); + ::std::copy(policy, end_offsets, end_offsets + nsegments, + host_accessible_offset_ends.begin()); + + for (::std::uint64_t i = 0; i < nsegments; i++) { + ::std::uint64_t segment_begin = host_accessible_offset_starts[i]; + ::std::uint64_t segment_end = + ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]); + if (segment_begin < segment_end) { + ::dpct::sort_keys(policy, keys_in + segment_begin, + keys_out + segment_begin, segment_end - segment_begin, + descending, begin_bit, end_bit); + } + } +} + +template +inline void segmented_sort_pairs_by_parallel_for_of_sorts( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments, + Iter5 begin_offsets, Iter5 end_offsets, bool descending = false, + int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + policy.queue().submit([&](sycl::handler &cgh) { + cgh.parallel_for(nsegments, [=](sycl::id<1> i) { + ::std::uint64_t segment_begin = begin_offsets[i]; + ::std::uint64_t segment_end = + ::std::min(n, (::std::int64_t)end_offsets[i]); + if (segment_begin == segment_end) { + return; + } + ::dpct::sort_pairs(::std::execution::seq, keys_in + segment_begin, + keys_out + segment_begin, values_in + segment_begin, + values_out + segment_begin, + segment_end - segment_begin, descending, begin_bit, + end_bit); + }); + }); + policy.queue().wait(); +} + +template +inline void segmented_sort_keys_by_parallel_for_of_sorts( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n, + ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets, + bool descending = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + policy.queue().submit([&](sycl::handler &cgh) { + cgh.parallel_for(nsegments, [=](sycl::id<1> i) { + ::std::uint64_t segment_begin = begin_offsets[i]; + ::std::uint64_t segment_end = + ::std::min(n, (::std::int64_t)end_offsets[i]); + if (segment_begin == segment_end) { + return; + } + ::dpct::sort_keys(::std::execution::seq, keys_in + segment_begin, + keys_out + segment_begin, segment_end - segment_begin, + descending, begin_bit, end_bit); + }); + }); + policy.queue().wait(); +} + +template +inline void mark_segments(Policy &&policy, OffsetIteratorT begin_offsets, + OffsetIteratorT end_offsets, ::std::int64_t n, + ::std::int64_t nsegments, + sycl::buffer<::std::size_t, 1> segments) { + + ::std::size_t work_group_size = + policy.queue() + .get_device() + .template get_info(); + + auto sg_sizes = policy.queue() + .get_device() + .template get_info(); + ::std::size_t sub_group_size = sg_sizes.empty() ? 0 : sg_sizes.back(); + + float avg_seg_size = (float)n / (float)nsegments; + if (avg_seg_size > work_group_size) { + // If average segment size is larger than workgroup, use workgroup to + // coordinate to mark segments + policy.queue() + .submit([&](sycl::handler &h) { + auto segments_acc = segments.get_access(h); + h.parallel_for(work_group_size, ([=](sycl::id<1> id) { + for (::std::size_t seg = 0; seg < nsegments; seg++) { + ::std::size_t i = begin_offsets[seg]; + ::std::size_t end = end_offsets[seg]; + while (i + id < end) { + segments_acc[i + id] = seg; + i += work_group_size; + } + } + })); + }) + .wait(); + } else if (sub_group_size > 0 && avg_seg_size > sub_group_size / 2) { + // If average segment size is larger than half a subgroup, use subgroup to + // coordinate to mark segments + policy.queue() + .submit([&](sycl::handler &h) { + auto segments_acc = segments.get_access(h); + h.parallel_for( + sycl::nd_range<1>{work_group_size, work_group_size}, + ([=](sycl::nd_item<1> item) { + auto sub_group = item.get_sub_group(); + ::std::size_t num_subgroups = + sub_group.get_group_range().size(); + ::std::size_t local_size = sub_group.get_local_range().size(); + + ::std::size_t sub_group_id = sub_group.get_group_id(); + while (sub_group_id < nsegments) { + ::std::size_t subgroup_local_id = sub_group.get_local_id(); + ::std::size_t i = begin_offsets[sub_group_id]; + ::std::size_t end = end_offsets[sub_group_id]; + while (i + subgroup_local_id < end) { + segments_acc[i + subgroup_local_id] = sub_group_id; + i += local_size; + } + sub_group_id += num_subgroups; + } + })); + }) + .wait(); + } else { + // If average segment size is small as compared to subgroup, use single + // work item to mark each segment + policy.queue() + .submit([&](sycl::handler &h) { + auto segments_acc = segments.get_access(h); + h.parallel_for(nsegments, ([=](sycl::id<1> seg) { + for (::std::size_t i = begin_offsets[seg]; + i < end_offsets[seg]; i++) { + segments_acc[i] = seg; + } + })); + }) + .wait(); + } +} + +// The dpl_histogram namespace contains a temporary preview of an upcoming +// oneDPL histogram API. This namespace will be removed and replaced with +// corresponding calls to oneapi::dpl::histogram() +namespace dpl_histogram { + +template +constexpr inline auto __ceiling_div(const T1 &number, const T2 &divisor) { + return (number - 1) / divisor + 1; +} + +template +struct __evenly_divided_binhash_impl {}; + +template +struct __evenly_divided_binhash_impl { + T __minimum; + ::std::uint32_t __num_bins; + T __scale; + T __maximum; + __evenly_divided_binhash_impl(const T &min, const T &max, + const ::std::uint32_t &num_bins) + : __minimum(min), __maximum(max), __num_bins(num_bins), + __scale(T(num_bins) / (max - min)) {} + template std::uint32_t operator()(T2 &&value) const { + return ::std::uint32_t((::std::forward(value) - __minimum) * __scale); + } + + template bool is_valid(const T2 &value) const { + return value >= __minimum && value < __maximum; + } +}; + +// non floating point type +template +struct __evenly_divided_binhash_impl { + T __minimum; + ::std::uint32_t __num_bins; + T __range_size; + __evenly_divided_binhash_impl(const T &min, const T &max, + const ::std::uint32_t &num_bins) + : __minimum(min), __num_bins(num_bins), __range_size(max - min) {} + template ::std::uint32_t operator()(T2 &&value) const { + return ::std::uint32_t( + ((::std::uint64_t(::std::forward(value)) - __minimum) * + ::std::uint64_t(__num_bins)) / + __range_size); + } + + template bool is_valid(const T2 &value) const { + return value >= __minimum && value < __minimum + __range_size; + } +}; + +template +using __evenly_divided_binhash = + __evenly_divided_binhash_impl>; + +template struct __custom_range_binhash { + Range __boundaries; + __custom_range_binhash(Range boundaries) : __boundaries(boundaries) {} + + template ::std::uint32_t operator()(T &&value) const { + return (::std::upper_bound(__boundaries.begin(), __boundaries.end(), + ::std::forward(value)) - + __boundaries.begin()) - + 1; + } + + template bool is_valid(const T2 &value) const { + return value >= __boundaries[0] && + value < __boundaries[__boundaries.size() - 1]; + } +}; + +template +inline void __clear_wglocal_histograms(const HistAccessor &local_histogram, + const OffsetT &offset, + const Size &num_bins, + const sycl::nd_item<1> &self_item) { + ::std::uint32_t gSize = self_item.get_local_range()[0]; + ::std::uint32_t self_lidx = self_item.get_local_id(0); + ::std::uint8_t factor = __ceiling_div(num_bins, gSize); + ::std::uint8_t k; + _DPCT_PRAGMA_UNROLL + for (k = 0; k < factor - 1; k++) { + local_histogram[offset + gSize * k + self_lidx] = 0; + } + if (gSize * k + self_lidx < num_bins) { + local_histogram[offset + gSize * k + self_lidx] = 0; + } + self_item.barrier(sycl::access::fence_space::local_space); +} + +template +inline void __accum_local_register_iter(const Iter1 &in_acc, + const ::std::size_t &index, + HistReg *histogram, BinFunc func) { + const auto &x = in_acc[index]; + if (func.is_valid(x)) { + BinIdxType c = func(x); + histogram[c]++; + } +} + +template +inline void __accum_local_atomics_iter(const Iter1 &in_acc, + const ::std::size_t &index, + const HistAccessor &wg_local_histogram, + const OffsetT &offset, BinFunc func) { + using __histo_value_type = typename HistAccessor::value_type; + const auto &x = in_acc[index]; + if (func.is_valid(x)) { + BinIdxType c = func(x); + sycl::atomic_ref<__histo_value_type, sycl::memory_order::relaxed, + sycl::memory_scope::work_group, AddressSpace> + local_bin(wg_local_histogram[offset + c]); + local_bin++; + } +} + +template +inline void __reduce_out_histograms(const HistAccessorIn &in_histogram, + const OffsetT &offset, + const HistAccessorOut &out_histogram, + const Size &num_bins, + const sycl::nd_item<1> &self_item) { + ::std::uint32_t gSize = self_item.get_local_range()[0]; + ::std::uint32_t self_lidx = self_item.get_local_id(0); + ::std::uint8_t factor = __ceiling_div(num_bins, gSize); + ::std::uint8_t k; + + _DPCT_PRAGMA_UNROLL + for (k = 0; k < factor - 1; k++) { + sycl::atomic_ref + global_bin(out_histogram[gSize * k + self_lidx]); + global_bin += in_histogram[offset + gSize * k + self_lidx]; + } + if (gSize * k + self_lidx < num_bins) { + sycl::atomic_ref + global_bin(out_histogram[gSize * k + self_lidx]); + global_bin += in_histogram[offset + gSize * k + self_lidx]; + } +} + +template <::std::uint16_t ItersPerWorkItem, ::std::uint8_t BinsPerWorkItem, + typename BinType, typename Policy, typename Range1, typename Range2, + typename Size, typename IdxHashFunc, typename... Range3> +inline void __histogram_general_registers_local_reduction( + Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input, + Range2 &&bins, const Size &num_bins, IdxHashFunc func, + Range3 &&...opt_range) { + const ::std::size_t N = input.size(); + using __local_histogram_type = ::std::uint32_t; + using __private_histogram_type = ::std::uint16_t; + + ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem); + auto e = policy.queue().submit([&](auto &h) { + // Temporary use of stable non-public API from oneDPL, this function will + // be replaced with oneDPL call in an upcoming release. + oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...); + sycl::local_accessor<__local_histogram_type, 1> local_histogram( + sycl::range(num_bins), h); + h.parallel_for( + sycl::nd_range<1>(segments * work_group_size, work_group_size), + [=](sycl::nd_item<1> __self_item) { + using __bin_idx_type = ::std::uint8_t; + const ::std::size_t __self_lidx = __self_item.get_local_id(0); + const ::std::size_t __wgroup_idx = __self_item.get_group(0); + const ::std::size_t __seg_start = + work_group_size * ItersPerWorkItem * __wgroup_idx; + + __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item); + __private_histogram_type histogram[BinsPerWorkItem]; + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t k = 0; k < BinsPerWorkItem; k++) { + histogram[k] = 0; + } + + if (__seg_start + work_group_size * ItersPerWorkItem < N) { + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) { + __accum_local_register_iter<__bin_idx_type>( + input, __seg_start + idx * work_group_size + __self_lidx, + histogram, func); + } + } else { + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) { + ::std::size_t __val_idx = + __seg_start + idx * work_group_size + __self_lidx; + if (__val_idx < N) { + __accum_local_register_iter<__bin_idx_type>(input, __val_idx, + histogram, func); + } + } + } + + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t k = 0; k < num_bins; k++) { + sycl::atomic_ref<__local_histogram_type, + sycl::memory_order::relaxed, + sycl::memory_scope::work_group, + sycl::access::address_space::local_space> + local_bin(local_histogram[k]); + local_bin += histogram[k]; + } + + __self_item.barrier(sycl::access::fence_space::local_space); + + __reduce_out_histograms(local_histogram, 0, bins, num_bins, + __self_item); + }); + }); + e.wait(); +} + +template <::std::uint16_t ItersPerWorkItem, typename BinType, typename Policy, + typename Range1, typename Range2, typename Size, typename IdxHashFunc, + typename... Range3> +inline void __histogram_general_local_atomics(Policy &&policy, + ::std::uint16_t work_group_size, + Range1 &&input, Range2 &&bins, + const Size &num_bins, + IdxHashFunc func, + Range3 &&...opt_range) { + const ::std::size_t N = input.size(); + ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem); + auto e = policy.queue().submit([&](auto &h) { + // Temporary use of stable non-public API from oneDPL, this function will + // be replaced with oneDPL call in an upcoming release. + oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...); + sycl::local_accessor<::std::uint32_t, 1> local_histogram( + sycl::range(num_bins), h); + h.parallel_for( + sycl::nd_range<1>(segments * work_group_size, work_group_size), + [=](sycl::nd_item<1> __self_item) { + using __bin_idx_type = ::std::uint16_t; + constexpr auto __atomic_address_space = + sycl::access::address_space::local_space; + const ::std::size_t __self_lidx = __self_item.get_local_id(0); + const ::std::uint32_t __wgroup_idx = __self_item.get_group(0); + const ::std::size_t __seg_start = + work_group_size * __wgroup_idx * ItersPerWorkItem; + + __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item); + + if (__seg_start + work_group_size * ItersPerWorkItem < N) { + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) { + __accum_local_atomics_iter<__bin_idx_type, + __atomic_address_space>( + input, __seg_start + idx * work_group_size + __self_lidx, + local_histogram, 0, func); + } + } else { + _DPCT_PRAGMA_UNROLL + for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) { + ::std::size_t __val_idx = + __seg_start + idx * work_group_size + __self_lidx; + if (__val_idx < N) { + __accum_local_atomics_iter<__bin_idx_type, + __atomic_address_space>( + input, __val_idx, local_histogram, 0, func); + } + } + } + __self_item.barrier(sycl::access::fence_space::local_space); + + __reduce_out_histograms(local_histogram, 0, bins, num_bins, + __self_item); + }); + }); + + e.wait(); +} + +template <::std::uint16_t __min_iters_per_work_item, typename BinType, + typename Policy, typename Range1, typename Range2, typename Size, + typename IdxHashFunc, typename... Range3> +inline void __histogram_general_private_global_atomics( + Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input, + Range2 &&bins, const Size &num_bins, IdxHashFunc func, + Range3 &&...opt_range) { + + const ::std::size_t N = input.size(); + auto __global_mem_size = + policy.queue() + .get_device() + .template get_info(); + const ::std::size_t max_segments = + ::std::min(__global_mem_size / (num_bins * sizeof(BinType)), + __ceiling_div(N, work_group_size * __min_iters_per_work_item)); + const ::std::size_t iters_per_work_item = + __ceiling_div(N, max_segments * work_group_size); + ::std::size_t segments = + __ceiling_div(N, work_group_size * iters_per_work_item); + + sycl::buffer private_histograms( + sycl::range<1>(segments * num_bins)); + + auto e = policy.queue().submit([&](auto &h) { + // Temporary use of stable non-public API from oneDPL, this function will + // be replaced with oneDPL call in an upcoming release. + oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...); + sycl::accessor hacc_private(private_histograms, h, sycl::read_write, + sycl::no_init); + h.parallel_for( + sycl::nd_range<1>(segments * work_group_size, work_group_size), + [=](sycl::nd_item<1> __self_item) { + using __bin_idx_type = ::std::uint32_t; + constexpr auto __atomic_address_space = + sycl::access::address_space::global_space; + const ::std::size_t __self_lidx = __self_item.get_local_id(0); + const ::std::size_t __wgroup_idx = __self_item.get_group(0); + const ::std::size_t __seg_start = + work_group_size * iters_per_work_item * __wgroup_idx; + + __clear_wglocal_histograms(hacc_private, __wgroup_idx * num_bins, + num_bins, __self_item); + if (__seg_start + work_group_size * iters_per_work_item < N) { + for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) { + __accum_local_atomics_iter<__bin_idx_type, + __atomic_address_space>( + input, __seg_start + idx * work_group_size + __self_lidx, + hacc_private, __wgroup_idx * num_bins, func); + } + } else { + for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) { + ::std::size_t __val_idx = + __seg_start + idx * work_group_size + __self_lidx; + if (__val_idx < N) { + __accum_local_atomics_iter<__bin_idx_type, + __atomic_address_space>( + input, __val_idx, hacc_private, __wgroup_idx * num_bins, + func); + } + } + } + __self_item.barrier(sycl::access::fence_space::local_space); + + __reduce_out_histograms(hacc_private, + __wgroup_idx * num_bins, bins, + num_bins, __self_item); + }); + }); + e.wait(); +} + +template +inline Iter2 +__histogram_general_select_best(Policy &&policy, Iter1 first, Iter1 last, + Iter2 histogram_first, const Size &num_bins, + IdxHashFunc func, Range &&...opt_range) { + using __histo_value_type = typename ::std::iterator_traits::value_type; + auto __local_mem_size = + policy.queue() + .get_device() + .template get_info(); + constexpr ::std::uint8_t __max_registers = 16; + + // Temporary use of stable non-public API from oneDPL, this function will be + // replaced with oneDPL call in an upcoming release. + auto keep_bins = oneapi::dpl::__ranges::__get_sycl_range< + oneapi::dpl::__par_backend_hetero::access_mode::write, Iter2>(); + auto bins_buf = keep_bins(histogram_first, histogram_first + num_bins); + + oneapi::dpl::fill(policy, bins_buf.all_view().begin(), + bins_buf.all_view().end(), __histo_value_type(0)); + auto N = last - first; + if (N > 0) { + // Temporary use of stable non-public API from oneDPL, this function will + // be replaced with oneDPL call in an upcoming release. + auto keep_input = oneapi::dpl::__ranges::__get_sycl_range< + oneapi::dpl::__par_backend_hetero::access_mode::read, Iter1>(); + auto input_buf = keep_input(first, last); + + ::std::size_t max_work_group_size = + policy.queue() + .get_device() + .template get_info(); + ::std::size_t work_group_size = + ::std::min(max_work_group_size, ::std::size_t(1024)); + + if (num_bins < __max_registers) { + + // If bins fit into registers, use register private accumulation + __histogram_general_registers_local_reduction<32, 16, __histo_value_type>( + ::std::forward(policy), work_group_size, input_buf.all_view(), + bins_buf.all_view(), num_bins, func, + ::std::forward(opt_range)...); + } else if (num_bins * sizeof(__histo_value_type) < __local_mem_size) { + // If bins fit into SLM, use local atomics + + // Experimentally determined iters per work-item + if (N <= 524288) { + __histogram_general_local_atomics<4, __histo_value_type>( + ::std::forward(policy), work_group_size, + input_buf.all_view(), bins_buf.all_view(), num_bins, func, + ::std::forward(opt_range)...); + } else { + __histogram_general_local_atomics<32, __histo_value_type>( + ::std::forward(policy), work_group_size, + input_buf.all_view(), bins_buf.all_view(), num_bins, func, + ::std::forward(opt_range)...); + } + } else // Otherwise, use global atomics (private copies per workgroup) + { + // Experimentally determined iters per work-item + if (N <= 524288) { + __histogram_general_private_global_atomics<4, __histo_value_type>( + ::std::forward(policy), work_group_size, + input_buf.all_view(), bins_buf.all_view(), num_bins, func, + ::std::forward(opt_range)...); + } else { + __histogram_general_private_global_atomics<32, __histo_value_type>( + ::std::forward(policy), work_group_size, + input_buf.all_view(), bins_buf.all_view(), num_bins, func, + ::std::forward(opt_range)...); + } + } + } + return histogram_first + num_bins; +} + +template +inline ::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value, + Iter2> +histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first, + const Size &num_bins, const T &first_bin_min_val, + const T &last_bin_max_val) { + return __histogram_general_select_best( + ::std::forward(policy), first, last, histogram_first, num_bins, + __evenly_divided_binhash(first_bin_min_val, last_bin_max_val, + num_bins)); +} + +template +inline ::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value, + Iter2> +histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first, + Iter3 boundary_first, Iter3 boundary_last) { + // Temporary use of stable non-public API from oneDPL, this function will be + // replaced with oneDPL call in an upcoming release. + auto keep_boundaries = oneapi::dpl::__ranges::__get_sycl_range< + oneapi::dpl::__par_backend_hetero::access_mode::read, Iter3>(); + auto boundary_buf = keep_boundaries(boundary_first, boundary_last); + + return __histogram_general_select_best( + ::std::forward(policy), first, last, histogram_first, + (boundary_last - boundary_first) - 1, + __custom_range_binhash{boundary_buf.all_view()}, boundary_buf.all_view()); +} +} // end namespace dpl_histogram + +} // end namespace internal + +// Evenly Divided Histogram of a 1-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +histogram_even(Policy &&policy, Iter1 d_samples, Iter2 d_histogram, + int num_levels, T lower_level, T upper_level, Size num_samples) { + internal::dpl_histogram::histogram(::std::forward(policy), d_samples, + d_samples + num_samples, d_histogram, + num_levels - 1, lower_level, upper_level); +} + +// Evenly Divided Histogram of a 2-D ROI in a flattened 2-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +histogram_even_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram, + int num_levels, T lower_level, T upper_level, + OffsetT num_row_samples, OffsetT num_rows, + ::std::size_t row_stride_bytes) { + return histogram_even( + ::std::forward(policy), + oneapi::dpl::permutation_iterator( + d_samples, + internal::__roi_2d_index_functor( + num_row_samples, + row_stride_bytes / + sizeof(typename ::std::iterator_traits::value_type))), + d_histogram, num_levels, lower_level, upper_level, + num_row_samples * num_rows); +} + +// Evenly Divided Multi-Channel Histogram of a 1-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +multi_histogram_even(Policy &&policy, Iter1 d_samples, + Iter2 d_histogram[NumActiveChannels], + int num_levels[NumActiveChannels], + T lower_level[NumActiveChannels], + T upper_level[NumActiveChannels], Size num_pixels) { + for (int active_channel = 0; active_channel < NumActiveChannels; + active_channel++) { + histogram_even( + policy, + oneapi::dpl::permutation_iterator( + d_samples, + internal::__interleaved_index_functor(NumChannels, active_channel)), + d_histogram[active_channel], num_levels[active_channel], + lower_level[active_channel], upper_level[active_channel], num_pixels); + } +} + +// Evenly Divided Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +multi_histogram_even_roi(Policy &&policy, Iter1 d_samples, + Iter2 d_histogram[NumActiveChannels], + int num_levels[NumActiveChannels], + T lower_level[NumActiveChannels], + T upper_level[NumActiveChannels], + OffsetT num_row_samples, OffsetT num_rows, + ::std::size_t row_stride_bytes) { + for (int active_channel = 0; active_channel < NumActiveChannels; + active_channel++) { + histogram_even( + policy, + oneapi::dpl::permutation_iterator( + d_samples, + internal::__composition_functor( + internal::__roi_2d_index_functor( + num_row_samples, + row_stride_bytes / + (NumChannels * sizeof(typename ::std::iterator_traits< + Iter1>::value_type))), + internal::__interleaved_index_functor(NumChannels, + active_channel))), + d_histogram[active_channel], num_levels[active_channel], + lower_level[active_channel], upper_level[active_channel], + num_row_samples * num_rows); + } +} + +// Custom Range Histogram of a 1-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +histogram_range(Policy &&policy, Iter1 d_samples, Iter2 d_histogram, + int num_levels, Iter3 d_levels, Size num_samples) { + internal::dpl_histogram::histogram(::std::forward(policy), d_samples, + d_samples + num_samples, d_histogram, + d_levels, d_levels + num_levels); +} + +// Custom Range Histogram of a 2-D ROI in a flattened 2-D Array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +histogram_range_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram, + int num_levels, Iter3 d_levels, OffsetT num_row_samples, + OffsetT num_rows, ::std::size_t row_stride_bytes) { + return histogram_range( + ::std::forward(policy), + oneapi::dpl::permutation_iterator( + d_samples, + internal::__roi_2d_index_functor( + num_row_samples, + row_stride_bytes / + sizeof(typename ::std::iterator_traits::value_type))), + d_histogram, num_levels, d_levels, num_row_samples * num_rows); +} + +// Custom Range Multi-Channel Histogram of a 1-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +multi_histogram_range(Policy &&policy, Iter1 d_samples, + Iter2 d_histogram[NumActiveChannels], + int num_levels[NumActiveChannels], + Iter3 d_levels[NumActiveChannels], Size num_pixels) { + for (int active_channel = 0; active_channel < NumActiveChannels; + active_channel++) { + histogram_range(policy, + oneapi::dpl::permutation_iterator( + d_samples, internal::__interleaved_index_functor( + NumChannels, active_channel)), + d_histogram[active_channel], num_levels[active_channel], + d_levels[active_channel], num_pixels); + } +} + +// Custom Range Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array +template +::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +multi_histogram_range_roi(Policy &&policy, Iter1 d_samples, + Iter2 d_histogram[NumActiveChannels], + int num_levels[NumActiveChannels], + Iter3 d_levels[NumActiveChannels], + OffsetT num_row_samples, OffsetT num_rows, + ::std::size_t row_stride_bytes) { + for (int active_channel = 0; active_channel < NumActiveChannels; + active_channel++) { + histogram_range( + policy, + oneapi::dpl::permutation_iterator( + d_samples, + internal::__composition_functor( + internal::__roi_2d_index_functor( + num_row_samples, + row_stride_bytes / + (NumChannels * sizeof(typename ::std::iterator_traits< + Iter1>::value_type))), + internal::__interleaved_index_functor(NumChannels, + active_channel))), + d_histogram[active_channel], num_levels[active_channel], + d_levels[active_channel], num_row_samples * num_rows); + } +} + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value> +sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, bool descending, int begin_bit, + int end_bit) { + internal::sort_pairs_impl(std::forward(policy), keys_in, keys_out, + values_in, values_out, n, descending, begin_bit, + end_bit); +} + +template +inline void sort_pairs( + Policy &&policy, io_iterator_pair &keys, + io_iterator_pair &values, ::std::int64_t n, bool descending = false, + bool do_swap_iters = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + sort_pairs(::std::forward(policy), keys.first(), keys.second(), + values.first(), values.second(), n, descending, begin_bit, + end_bit); + if (do_swap_iters) { + keys.swap(); + values.swap(); + } +} + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value> +sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n, + bool descending, int begin_bit, int end_bit) { + using key_t_value_t = typename ::std::iterator_traits::value_type; + + int clipped_begin_bit = ::std::max(begin_bit, 0); + int clipped_end_bit = + ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8); + int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1; + + auto transform_and_sort_f = [&](auto x) { + using T = typename ::std::decay_t; + internal::transform_and_sort( + ::std::forward(policy), keys_in, keys_out, n, descending, + clipped_begin_bit, clipped_end_bit); + }; + + if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) { + internal::sort_only(::std::forward(policy), keys_in, keys_out, n, + descending); + } else if (num_bytes == 1) { + transform_and_sort_f.template operator()(0); + } else if (num_bytes == 2) { + transform_and_sort_f.template operator()(0); + } else if (num_bytes <= 4) { + transform_and_sort_f.template operator()(0); + } else // if (num_bytes <= 8) + { + transform_and_sort_f.template operator()<::std::uint64_t>(0); + } +} + +template +inline void sort_keys( + Policy &&policy, io_iterator_pair &keys, ::std::int64_t n, + bool descending = false, bool do_swap_iters = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + sort_keys(std::forward(policy), keys.first(), keys.second(), n, + descending, begin_bit, end_bit); + if (do_swap_iters) + keys.swap(); +} + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value> +segmented_sort_keys( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n, + ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets, + bool descending = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + int compute_units = + policy.queue() + .get_device() + .template get_info(); + auto sg_sizes = policy.queue() + .get_device() + .template get_info(); + int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back(); + // parallel for of serial sorts when we have sufficient number of segments for + // load balance when number of segments is large as compared to our target + // compute capability + if (nsegments > + compute_units * + (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) { + dpct::internal::segmented_sort_keys_by_parallel_for_of_sorts( + ::std::forward(policy), keys_in, keys_out, n, nsegments, + begin_offsets, end_offsets, descending, begin_bit, end_bit); + } else + { + dpct::internal::segmented_sort_keys_by_parallel_sorts( + ::std::forward(policy), keys_in, keys_out, n, nsegments, + begin_offsets, end_offsets, descending, begin_bit, end_bit); + } +} + +template +inline void segmented_sort_keys( + Policy &&policy, io_iterator_pair &keys, ::std::int64_t n, + ::std::int64_t nsegments, Iter2 begin_offsets, Iter2 end_offsets, + bool descending = false, bool do_swap_iters = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + segmented_sort_keys(::std::forward(policy), keys.first(), + keys.second(), n, nsegments, begin_offsets, end_offsets, + descending, begin_bit, end_bit); + if (do_swap_iters) { + keys.swap(); + } +} + +template +inline ::std::enable_if_t::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value> +segmented_sort_pairs( + Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in, + Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments, + Iter5 begin_offsets, Iter5 end_offsets, bool descending = false, + int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + int compute_units = + policy.queue() + .get_device() + .template get_info(); + auto sg_sizes = policy.queue() + .get_device() + .template get_info(); + int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back(); + // parallel for of serial sorts when we have sufficient number of segments for + // load balance when number of segments is large as compared to our target + // compute capability + if (nsegments > + compute_units * + (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) { + dpct::internal::segmented_sort_pairs_by_parallel_for_of_sorts( + ::std::forward(policy), keys_in, keys_out, values_in, + values_out, n, nsegments, begin_offsets, end_offsets, descending, + begin_bit, end_bit); + } else + { + dpct::internal::segmented_sort_pairs_by_parallel_sorts( + ::std::forward(policy), keys_in, keys_out, values_in, + values_out, n, nsegments, begin_offsets, end_offsets, descending, + begin_bit, end_bit); + } +} + +template +inline void segmented_sort_pairs( + Policy &&policy, io_iterator_pair &keys, + io_iterator_pair &values, ::std::int64_t n, ::std::int64_t nsegments, + Iter3 begin_offsets, Iter3 end_offsets, bool descending = false, + bool do_swap_iters = false, int begin_bit = 0, + int end_bit = sizeof(typename ::std::iterator_traits::value_type) * + 8) { + segmented_sort_pairs(std::forward(policy), keys.first(), + keys.second(), values.first(), values.second(), n, + nsegments, begin_offsets, end_offsets, descending, + begin_bit, end_bit); + if (do_swap_iters) { + keys.swap(); + values.swap(); + } +} + +template +inline void reduce_argmax(Policy &&policy, Iter1 input, Iter2 output, + ::std::size_t n) { + dpct::arg_index_input_iterator input_arg_idx(input); + auto ret = ::std::max_element( + policy, input_arg_idx, input_arg_idx + n, + [](const auto &a, const auto &b) { return (a.value < b.value); }); + ::std::copy(::std::forward(policy), ret, ret + 1, output); +} + +template +inline void reduce_argmin(Policy &&policy, Iter1 input, Iter2 output, + ::std::size_t n) { + dpct::arg_index_input_iterator input_arg_idx(input); + auto ret = ::std::min_element( + policy, input_arg_idx, input_arg_idx + n, + [](const auto &a, const auto &b) { return (a.value < b.value); }); + ::std::copy(::std::forward(policy), ret, ret + 1, output); +} + +template +inline ::std::pair equal_range(Policy &&policy, Iter1 start, + Iter1 end, const ValueT &value, + CompT comp) { + ::std::vector<::std::int64_t> res_lower(1); + ::std::vector<::std::int64_t> res_upper(1); + ::std::vector value_vec(1, value); + ::oneapi::dpl::lower_bound(policy, start, end, value_vec.begin(), + value_vec.end(), res_lower.begin(), comp); + ::oneapi::dpl::upper_bound(::std::forward(policy), start, end, + value_vec.begin(), value_vec.end(), + res_upper.begin(), comp); + return ::std::make_pair(start + res_lower[0], start + res_upper[0]); +} + +template +inline ::std::pair equal_range(Policy &&policy, Iter1 start, + Iter1 end, const ValueT &value) { + return equal_range(::std::forward(policy), start, end, value, + internal::__less()); +} + +template +inline ::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +segmented_reduce_argmin(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + ::std::int64_t nsegments, Iter3 begin_offsets, + Iter3 end_offsets) { + policy.queue().submit([&](sycl::handler &cgh) { + cgh.parallel_for(nsegments, [=](sycl::id<1> i) { + if (end_offsets[i] <= begin_offsets[i]) { + keys_out[i] = dpct::key_value_pair( + 1, ::std::numeric_limits< + typename ::std::iterator_traits::value_type>::max()); + } else { + dpct::arg_index_input_iterator arg_index(keys_in + + begin_offsets[i]); + keys_out[i] = *::std::min_element( + arg_index, arg_index + (end_offsets[i] - begin_offsets[i]), + [](const auto &a, const auto &b) { return a.value < b.value; }); + } + }); + }); + policy.queue().wait(); +} + +template +inline ::std::enable_if_t< + dpct::internal::is_iterator::value && + dpct::internal::is_iterator::value && + internal::is_hetero_execution_policy<::std::decay_t>::value> +segmented_reduce_argmax(Policy &&policy, Iter1 keys_in, Iter2 keys_out, + ::std::int64_t nsegments, Iter3 begin_offsets, + Iter3 end_offsets) { + policy.queue().submit([&](sycl::handler &cgh) { + cgh.parallel_for(nsegments, [=](sycl::id<1> i) { + if (end_offsets[i] <= begin_offsets[i]) { + keys_out[i] = dpct::key_value_pair( + 1, + ::std::numeric_limits< + typename ::std::iterator_traits::value_type>::lowest()); + } else { + dpct::arg_index_input_iterator arg_index(keys_in + + begin_offsets[i]); + keys_out[i] = *::std::max_element( + arg_index, arg_index + (end_offsets[i] - begin_offsets[i]), + [](const auto &a, const auto &b) { return a.value < b.value; }); + } + }); + }); + policy.queue().wait(); +} + +template +void nontrivial_run_length_encode(ExecutionPolicy &&policy, + InputIterator input_beg, + OutputIterator1 offsets_out, + OutputIterator2 lengths_out, + OutputIterator3 num_runs, + ::std::int64_t num_items) { + using oneapi::dpl::make_transform_iterator; + using oneapi::dpl::make_zip_iterator; + using offsets_t = + typename ::std::iterator_traits::value_type; + using lengths_t = + typename ::std::iterator_traits::value_type; + + auto input_end = input_beg + num_items; + // First element must be nontrivial run (start of first segment) + auto first_adj_it = oneapi::dpl::adjacent_find(policy, input_beg, input_end); + auto first_adj_idx = ::std::distance(input_beg, first_adj_it); + if (first_adj_it == input_end) { + ::std::fill(policy, num_runs, num_runs + 1, 0); + return; + } + auto get_prev_idx_element = [first_adj_idx](const auto &idx) { + auto out_idx = idx + first_adj_idx; + return (out_idx == 0) ? 0 : out_idx - 1; + }; + auto get_next_idx_element = [first_adj_idx, num_items](const auto &idx) { + auto out_idx = idx + first_adj_idx; + return (out_idx == num_items - 1) ? num_items - 1 : out_idx + 1; + }; + // TODO: Use shifted view to pad range once oneDPL ranges is non-experimental + auto left_shifted_input_beg = + oneapi::dpl::make_permutation_iterator(input_beg, get_prev_idx_element); + auto right_shifted_input_beg = + oneapi::dpl::make_permutation_iterator(input_beg, get_next_idx_element); + // Segment type for ith idx consists of zip of iterators at (i-1, i, i+1) + // padded at the ends + auto zipped_keys_beg = make_zip_iterator( + left_shifted_input_beg, input_beg, right_shifted_input_beg, + oneapi::dpl::counting_iterator(0)); + // Set flag at the beginning of new nontrivial run (ex: (2, 3, 3) -> 1) + auto key_flags_beg = + make_transform_iterator(zipped_keys_beg, [num_items](const auto &zipped) { + using ::std::get; + bool last_idx_mask = get<3>(zipped) != num_items - 1; + return (get<0>(zipped) != get<1>(zipped) && + get<1>(zipped) == get<2>(zipped)) && + last_idx_mask; + }); + auto count_beg = oneapi::dpl::counting_iterator(0); + auto const_it = dpct::make_constant_iterator(lengths_t(1)); + // Check for presence of nontrivial element at current index + auto tr_nontrivial_flags = make_transform_iterator( + make_zip_iterator(left_shifted_input_beg, input_beg), + [](const auto &zip) { + using ::std::get; + return get<0>(zip) == get<1>(zip); + }); + auto zipped_vals_beg = + make_zip_iterator(tr_nontrivial_flags, count_beg, const_it); + auto pred = [](bool lhs, bool rhs) { return !rhs; }; + auto op = [](auto lhs, const auto &rhs) { + using ::std::get; + + // Update length count of run. + // The first call to this op will use the first element of the input as lhs + // and second element as rhs. get<0>(first_element) is ignored in favor of a + // constant `1` in get<2>, avoiding the need for special casing the first + // element. The constant `1` utilizes the knowledge that each segment begins + // with a nontrivial run. + get<2>(lhs) += get<0>(rhs); + + // A run's starting index is stored in get<1>(lhs) as the initial value in + // the segment and is preserved throughout the segment's reduction as the + // nontrivial run's offset. + + return ::std::move(lhs); + }; + auto zipped_out_beg = make_zip_iterator(oneapi::dpl::discard_iterator(), + offsets_out, lengths_out); + auto [_, zipped_out_vals_end] = oneapi::dpl::reduce_by_segment( + policy, key_flags_beg + first_adj_idx, key_flags_beg + num_items, + zipped_vals_beg + first_adj_idx, oneapi::dpl::discard_iterator(), + zipped_out_beg, pred, op); + auto ret_dist = ::std::distance(zipped_out_beg, zipped_out_vals_end); + ::std::fill(policy, num_runs, num_runs + 1, ret_dist); +} + +} // end namespace dpct + +#endif diff --git a/dpct/dpl_extras/dpcpp_extensions.h b/dpct/dpl_extras/dpcpp_extensions.h new file mode 100644 index 0000000000000..05a0068e65925 --- /dev/null +++ b/dpct/dpl_extras/dpcpp_extensions.h @@ -0,0 +1,747 @@ +//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------===// + +#ifndef __DPCT_DPCPP_EXTENSIONS_H__ +#define __DPCT_DPCPP_EXTENSIONS_H__ + +#include +#include + +#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS +#include +#endif + +#include "../dpct.hpp" +#include "functional.h" + +namespace dpct { +namespace group { +namespace detail { + +template +constexpr auto __reduce_over_group(_Args... __args) { + return sycl::reduce_over_group(__args...); +} + +template constexpr auto __group_broadcast(_Args... __args) { + return sycl::group_broadcast(__args...); +} + +template +constexpr auto __exclusive_scan_over_group(_Args... __args) { + return sycl::exclusive_scan_over_group(__args...); +} + +template +constexpr auto __inclusive_scan_over_group(_Args... __args) { + return sycl::inclusive_scan_over_group(__args...); +} + +} // end namespace detail + +/// Perform an exclusive scan over the values of inputs from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param inputs Pointer to the input data for the scan operation. +/// \param outputs Pointer to the location where scan results will be stored. +/// \param init initial value of the scan result. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. +template +__dpct_inline__ void +exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD], + T (&outputs)[VALUES_PER_THREAD], T init, + BinaryOperation binary_op) { + T result = inputs[0]; + +#pragma unroll + for (int i = 1; i < VALUES_PER_THREAD; ++i) { + result = binary_op(result, inputs[i]); + } + + T exclusive_result = + detail::__exclusive_scan_over_group(item.get_group(), result, binary_op); + + T input = inputs[0]; + if (item.get_local_linear_id() == 0) { + outputs[0] = init; + } else { + outputs[0] = exclusive_result; + } + +#pragma unroll + for (int i = 1; i < VALUES_PER_THREAD; ++i) { + T output = binary_op(input, outputs[i - 1]); + input = inputs[i]; + outputs[i] = output; + } +} + +/// Perform an exclusive scan over the values of input from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param input Input data for the scan operation. +/// \param init initial value of the scan result. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \param group_aggregate group-wide aggregate of all inputs +/// in the work-items of the group. \returns exclusive scan of the first i +/// work-items where item is the i-th work item. +template +__dpct_inline__ T +exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op, + T &group_aggregate) { + T output = detail::__exclusive_scan_over_group(item.get_group(), input, init, + binary_op); + if (item.get_local_linear_id() == item.get_local_range().size() - 1) { + group_aggregate = binary_op(output, input); + } + + group_aggregate = detail::__group_broadcast( + item.get_group(), group_aggregate, item.get_local_range().size() - 1); + return output; +} + +/// Perform an exclusive scan over the values of input from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param input Input data for the scan operation. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \param prefix_callback_op functor invoked by the first +/// work-item in the group that returns the +/// initial value in the resulting scan of the work-items in the group. +/// \returns exclusive scan of the input elements assigned to work-items in the +/// group. +template +__dpct_inline__ T +exclusive_scan(const Item &item, T input, BinaryOperation binary_op, + GroupPrefixCallbackOperation &prefix_callback_op) { + T group_aggregate; + + T output = + detail::__exclusive_scan_over_group(item.get_group(), input, binary_op); + if (item.get_local_linear_id() == item.get_local_range().size() - 1) { + group_aggregate = binary_op(output, input); + } + + group_aggregate = detail::__group_broadcast( + item.get_group(), group_aggregate, item.get_local_range().size() - 1); + + T group_prefix = prefix_callback_op(group_aggregate); + if (item.get_local_linear_id() == 0) { + output = group_prefix; + } else { + output = binary_op(group_prefix, output); + } + + return output; +} + +namespace detail { + +typedef uint16_t digit_counter_type; +typedef uint32_t packed_counter_type; + +template struct log2 { + enum { VALUE = log2> 1), COUNT + 1>::VALUE }; +}; + +template struct log2 { + enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 }; +}; + +template class radix_rank { +public: + static size_t get_local_memory_size(size_t group_threads) { + return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type); + } + + radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {} + + template + __dpct_inline__ void + rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD], + int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) { + + digit_counter_type thread_prefixes[VALUES_PER_THREAD]; + digit_counter_type *digit_counters[VALUES_PER_THREAD]; + digit_counter_type *buffer = + reinterpret_cast(_local_memory); + + reset_local_memory(item); + + item.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits); + uint32_t sub_counter = digit >> LOG_COUNTER_LANES; + uint32_t counter_lane = digit & (COUNTER_LANES - 1); + + if (DESCENDING) { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + digit_counters[i] = + &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO + + item.get_local_linear_id() * PACKING_RATIO + sub_counter]; + thread_prefixes[i] = *digit_counters[i]; + *digit_counters[i] = thread_prefixes[i] + 1; + } + + item.barrier(sycl::access::fence_space::local_space); + + scan_counters(item); + + item.barrier(sycl::access::fence_space::local_space); + + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + ranks[i] = thread_prefixes[i] + *digit_counters[i]; + } + } + +private: + template + __dpct_inline__ void reset_local_memory(const Item &item) { + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0; + } + } + + template + __dpct_inline__ packed_counter_type upsweep(const Item &item) { + packed_counter_type sum = 0; + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; i++) { + cached_segment[i] = + ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i]; + } + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + sum += cached_segment[i]; + } + + return sum; + } + + template + __dpct_inline__ void + exclusive_downsweep(const Item &item, packed_counter_type raking_partial) { + packed_counter_type *ptr = + reinterpret_cast(_local_memory); + packed_counter_type sum = raking_partial; + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + packed_counter_type value = cached_segment[i]; + cached_segment[i] = sum; + sum += value; + } + +#pragma unroll + for (int i = 0; i < PADDED_COUNTER_LANES; ++i) { + ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] = + cached_segment[i]; + } + } + + struct prefix_callback { + __dpct_inline__ packed_counter_type + operator()(packed_counter_type block_aggregate) { + packed_counter_type block_prefix = 0; + +#pragma unroll + for (int packed = 1; packed < PACKING_RATIO; packed++) { + block_prefix += block_aggregate + << (sizeof(digit_counter_type) * 8 * packed); + } + + return block_prefix; + } + }; + + template + __dpct_inline__ void scan_counters(const Item &item) { + packed_counter_type raking_partial = upsweep(item); + + prefix_callback callback; + packed_counter_type exclusive_partial = exclusive_scan( + item, raking_partial, sycl::ext::oneapi::plus(), + callback); + + exclusive_downsweep(item, exclusive_partial); + } + +private: + static constexpr int PACKING_RATIO = + sizeof(packed_counter_type) / sizeof(digit_counter_type); + static constexpr int LOG_PACKING_RATIO = log2::VALUE; + static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO; + static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES; + static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1; + + packed_counter_type cached_segment[PADDED_COUNTER_LANES]; + uint8_t *_local_memory; +}; + +template struct base_traits { + + static __dpct_inline__ U twiddle_in(U key) { + throw std::runtime_error("Not implemented"); + } + static __dpct_inline__ U twiddle_out(U key) { + throw std::runtime_error("Not implemented"); + } +}; + +template struct base_traits { + static __dpct_inline__ U twiddle_in(U key) { return key; } + static __dpct_inline__ U twiddle_out(U key) { return key; } +}; + +template struct base_traits { + static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); + static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; } + static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; } +}; + +template struct base_traits { + static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1); + static __dpct_inline__ U twiddle_in(U key) { + U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT; + return key ^ mask; + } + static __dpct_inline__ U twiddle_out(U key) { + U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1); + return key ^ mask; + } +}; + +template struct traits : base_traits {}; +template <> struct traits : base_traits {}; +template <> struct traits : base_traits {}; +template <> struct traits : base_traits {}; + +} // namespace detail + +namespace detail { + +template struct power_of_two { + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + +__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) { + return (x >> shift) + addend; +} + +} // namespace detail + +/// Implements scatter to blocked exchange pattern used in radix sort algorithm. +/// +/// \tparam T type of the data elements exchanges +/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread +template class exchange { +public: + static size_t get_local_memory_size(size_t group_threads) { + size_t padding_values = + (INSERT_PADDING) + ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS) + : 0; + return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T); + } + + exchange(uint8_t *local_memory) : _local_memory(local_memory) {} + + /// Rearrange elements from rank order to blocked order + template + __dpct_inline__ void + scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD], + int (&ranks)[VALUES_PER_THREAD]) { + T *buffer = reinterpret_cast(_local_memory); + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; i++) { + int offset = ranks[i]; + if (INSERT_PADDING) + offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset); + buffer[offset] = keys[i]; + } + + item.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; i++) { + int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i; + if (INSERT_PADDING) + offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset); + keys[i] = buffer[offset]; + } + } + +private: + static constexpr int LOG_LOCAL_MEMORY_BANKS = 5; + static constexpr bool INSERT_PADDING = + (VALUES_PER_THREAD > 4) && + (detail::power_of_two::VALUE); + + uint8_t *_local_memory; +}; + +/// Implements radix sort to sort integer data elements assigned to all threads +/// in the group. +/// +/// \tparam T type of the data elements exchanges +/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread +/// \tparam DECENDING boolean value indicating if data elements are sorted in +/// decending order. +template +class radix_sort { +public: + static size_t get_local_memory_size(size_t group_threads) { + size_t ranks_size = + detail::radix_rank::get_local_memory_size(group_threads); + size_t exchange_size = + exchange::get_local_memory_size(group_threads); + return sycl::max(ranks_size, exchange_size); + } + + radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {} + + template + __dpct_inline__ void + sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0, + int end_bit = 8 * sizeof(T)) { + + uint32_t(&unsigned_keys)[VALUES_PER_THREAD] = + reinterpret_cast(keys); + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + unsigned_keys[i] = detail::traits::twiddle_in(unsigned_keys[i]); + } + + while (true) { + int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit); + + int ranks[VALUES_PER_THREAD]; + detail::radix_rank(_local_memory) + .template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits); + begin_bit += RADIX_BITS; + + item.barrier(sycl::access::fence_space::local_space); + + exchange(_local_memory) + .scatter_to_blocked(item, keys, ranks); + + item.barrier(sycl::access::fence_space::local_space); + + if (begin_bit >= end_bit) + break; + } + +#pragma unroll + for (int i = 0; i < VALUES_PER_THREAD; ++i) { + unsigned_keys[i] = detail::traits::twiddle_out(unsigned_keys[i]); + } + } + +private: + static constexpr int RADIX_BITS = 4; + + uint8_t *_local_memory; +}; + +/// Perform a reduction of the data elements assigned to all threads in the +/// group. +/// +/// \param item A work-item in a group. +/// \param inputs Pointer to the input data for the reduce operation. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \returns value of the reduction using binary_op +template +__dpct_inline__ T +reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) { + T result = inputs[0]; + +#pragma unroll + for (int i = 1; i < VALUES_PER_THREAD; i++) { + result = binary_op(result, inputs[i]); + } + return detail::__reduce_over_group(item.get_group(), result, binary_op); +} + +/// Perform a reduction on a limited number of the work items in a subgroup +/// +/// \param item A work-item in a group. +/// \param value value per work item which is to be reduced +/// \param items_to_reduce num work items at the start of the subgroup to reduce +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \returns value of the reduction using binary_op +template +__dpct_inline__ +typename ::std::enable_if_t, T> +reduce_over_partial_group(const Item &item, const T &value, + const ::std::uint16_t &items_to_reduce, + BinaryOperation binary_op) { + T value_temp = (item.get_local_linear_id() < items_to_reduce) + ? value + : sycl::known_identity_v; + return detail::__reduce_over_group(item.get_sub_group(), value_temp, + binary_op); +} + +/// Perform an inclusive scan over the values of inputs from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param inputs Pointer to the input data for the scan operation. +/// \param outputs Pointer to the location where scan results will be stored. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \returns inclusive scan of the input elements assigned to +/// work-items in the group. +template +__dpct_inline__ void +inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD], + T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) { + T result = inputs[0]; + +#pragma unroll + for (int i = 1; i < VALUES_PER_THREAD; ++i) { + result = binary_op(result, inputs[i]); + } + + T exclusive_result = + detail::__exclusive_scan_over_group(item.get_group(), result, binary_op); + + if (item.get_local_linear_id() == 0) { + outputs[0] = inputs[0]; + } else { + outputs[0] = binary_op(inputs[0], exclusive_result); + } + +#pragma unroll + for (int i = 1; i < VALUES_PER_THREAD; ++i) { + outputs[i] = binary_op(inputs[i], outputs[i - 1]); + } +} + +/// Perform an inclusive scan over the values of inputs from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param input Pointer to the input data for the scan operation. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \param group_aggregate group-wide aggregate of all inputs +/// in the work-items of the group. \returns inclusive scan of the input +/// elements assigned to work-items in the group. +template +__dpct_inline__ T inclusive_scan(const Item &item, T input, + BinaryOperation binary_op, + T &group_aggregate) { + T output = + detail::__inclusive_scan_over_group(item.get_group(), input, binary_op); + if (item.get_local_linear_id() == item.get_local_range().size() - 1) { + group_aggregate = output; + } + + group_aggregate = detail::__group_broadcast( + item.get_group(), group_aggregate, item.get_local_range().size() - 1); + return output; +} + +/// Perform an inclusive scan over the values of input from all work-items in +/// the group using the operator binary_op, which must be one of the SYCL 2020 +/// group algorithms library function objects. +/// +/// \param item A work-item in a group. +/// \param input Input data for the scan operation. +/// \param binary_op functor that implements the binary operation used to +/// perform the scan. \param prefix_callback_op functor invoked by the first +/// work-item in the group that returns the +/// initial value in the resulting scan of the work-items in the group. +/// \returns inclusive scan of the input elements assigned to work-items in the +/// group. +template +__dpct_inline__ T +inclusive_scan(const Item &item, T input, BinaryOperation binary_op, + GroupPrefixCallbackOperation &prefix_callback_op) { + T group_aggregate; + + T output = inclusive_scan(item, input, binary_op, group_aggregate); + T group_prefix = prefix_callback_op(group_aggregate); + + return binary_op(group_prefix, output); +} + +} // namespace group + +namespace device { + +namespace detail { + +template constexpr auto __joint_reduce(_Args... __args) { + return sycl::joint_reduce(__args...); +} + +} // namespace detail + +/// Perform a reduce on each of the segments specified within data stored on +/// the device. +/// +/// \param queue Command queue used to access device used for reduction +/// \param inputs Pointer to the data elements on the device to be reduced +/// \param outputs Pointer to the storage where the reduced value for each +/// segment will be stored \param segment_count number of segments to be reduced +/// \param begin_offsets Pointer to the set of indices that are the first +/// element in each segment \param end_offsets Pointer to the set of indices +/// that are one past the last element in each segment \param binary_op functor +/// that implements the binary operation used to perform the scan. \param init +/// initial value of the reduction for each segment. +template +void segmented_reduce(sycl::queue queue, T *inputs, T *outputs, + size_t segment_count, OffsetT *begin_offsets, + OffsetT *end_offsets, BinaryOperation binary_op, T init) { + + sycl::range<1> global_size(segment_count * GROUP_SIZE); + sycl::range<1> local_size(GROUP_SIZE); + + queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) { + OffsetT segment_begin = begin_offsets[item.get_group_linear_id()]; + OffsetT segment_end = end_offsets[item.get_group_linear_id()]; + if (segment_begin == segment_end) { + if (item.get_local_linear_id() == 0) { + outputs[item.get_group_linear_id()] = init; + } + return; + } + + sycl::multi_ptr + input_ptr = inputs; + T group_aggregate = detail::__joint_reduce( + item.get_group(), input_ptr + segment_begin, + input_ptr + segment_end, init, binary_op); + + if (item.get_local_linear_id() == 0) { + outputs[item.get_group_linear_id()] = group_aggregate; + } + }); + }); +} + + +#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS + +namespace experimental { +namespace detail { +template struct __is_any { + constexpr static bool value = std::disjunction_v< + std::is_same, std::remove_cv_t<_Ts>>...>; +}; + +template struct __in_native_op_list { + constexpr static bool value = + __is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>, + sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>, + sycl::multiplies<_Tp>>::value; +}; + +template struct __is_native_op { + constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value || + __in_native_op_list::value; +}; + +} // namespace detail + +/// Perform a reduce on each of the segments specified within data stored on +/// the device. Compared with dpct::device::segmented_reduce, this experimental +/// feature support user define reductions. +/// +/// \param queue Command queue used to access device used for reduction +/// \param inputs Pointer to the data elements on the device to be reduced +/// \param outputs Pointer to the storage where the reduced value for each +/// segment will be stored \param segment_count number of segments to be reduced +/// \param begin_offsets Pointer to the set of indices that are the first +/// element in each segment \param end_offsets Pointer to the set of indices +/// that are one past the last element in each segment \param binary_op functor +/// that implements the binary operation used to perform the scan. \param init +/// initial value of the reduction for each segment. +template +void segmented_reduce(sycl::queue queue, T *inputs, T *outputs, + size_t segment_count, OffsetT *begin_offsets, + OffsetT *end_offsets, BinaryOperation binary_op, T init) { + + sycl::range<1> global_size(segment_count * GROUP_SIZE); + sycl::range<1> local_size(GROUP_SIZE); + + if constexpr (!detail::__is_native_op::value) { + queue.submit([&](sycl::handler &cgh) { + size_t temp_memory_size = GROUP_SIZE * sizeof(T); + auto scratch = sycl::local_accessor(temp_memory_size, cgh); + cgh.parallel_for( + sycl::nd_range<1>(global_size, local_size), + [=](sycl::nd_item<1> item) { + OffsetT segment_begin = begin_offsets[item.get_group_linear_id()]; + OffsetT segment_end = end_offsets[item.get_group_linear_id()]; + if (segment_begin == segment_end) { + if (item.get_local_linear_id() == 0) { + outputs[item.get_group_linear_id()] = init; + } + return; + } + // Create a handle that associates the group with an allocation it + // can use + auto handle = + sycl::ext::oneapi::experimental::group_with_scratchpad( + item.get_group(), + sycl::span(&scratch[0], temp_memory_size)); + T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce( + handle, inputs + segment_begin, inputs + segment_end, init, + binary_op); + if (item.get_local_linear_id() == 0) { + outputs[item.get_group_linear_id()] = group_aggregate; + } + }); + }); + } else { + dpct::device::segmented_reduce(queue, inputs, outputs, + segment_count, begin_offsets, + end_offsets, binary_op, init); + } +} +} // namespace experimental + +#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS + + +} // namespace device +} // namespace dpct + +#endif diff --git a/dpct/dpl_extras/functional.h b/dpct/dpl_extras/functional.h new file mode 100644 index 0000000000000..bab82814c2103 --- /dev/null +++ b/dpct/dpl_extras/functional.h @@ -0,0 +1,453 @@ +//==---- functional.h -----------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_FUNCTIONAL_H__ +#define __DPCT_FUNCTIONAL_H__ + +#include +#include +#include + +#if ONEDPL_USE_DPCPP_BACKEND +#include +#endif + +#include +#include + +#include "../dpct.hpp" +#define _DPCT_GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + +// Portability "#pragma" definition +#ifdef _MSC_VER +#define _DPCT_PRAGMA(x) __pragma(x) +#else +#define _DPCT_PRAGMA(x) _Pragma(#x) +#endif + +// Enable loop unrolling pragmas where supported +#if (__INTEL_COMPILER || \ + (!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000)) +#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll) +#else // no pragma unroll +#define _DPCT_PRAGMA_UNROLL +#endif + +namespace dpct { + +struct null_type {}; + +// Function object to wrap user defined functors to provide compile time "const" +// workaround for user function objects. +// The SYCL spec (4.12) states that writing to a function object during a SYCL +// kernel is undefined behavior. This wrapper is provided as a compile-time +// work around, but functors used in SYCL kernels must be `const` in practice. +template struct mark_functor_const { + mutable _Op op; + mark_functor_const() : op() {} + mark_functor_const(const _Op &__op) : op(__op) {} + mark_functor_const(_Op &&__op) : op(::std::move(__op)) {} + template auto operator()(_T &&...x) const { + return op(std::forward<_T>(x)...); + } +}; + +namespace internal { + +template +using enable_if_execution_policy = + typename std::enable_if::type>::value, + _T>::type; + +template struct is_hetero_execution_policy : ::std::false_type {}; + +template +struct is_hetero_execution_policy< + oneapi::dpl::execution::device_policy> : ::std::true_type { +}; + +template struct is_fpga_execution_policy : ::std::false_type {}; + +#if _ONEDPL_FPGA_DEVICE +template +struct is_hetero_execution_policy< + execution::fpga_policy> : ::std::true_type { +}; +#endif + +template +using enable_if_hetero_execution_policy = typename std::enable_if< + is_hetero_execution_policy::type>::value, + _T>::type; + +#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT + +template +using index_sequence = std::index_sequence<_Sp...>; +template +using make_index_sequence = std::make_index_sequence<_Np>; + +#else + +template class index_sequence {}; + +template +struct make_index_sequence_impl + : make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {}; + +template struct make_index_sequence_impl<0, _Sp...> { + using type = index_sequence<_Sp...>; +}; + +template +using make_index_sequence = typename make_index_sequence_impl<_Np>::type; +#endif + +// Minimal buffer implementations for temporary storage in mapping rules +// Some of our algorithms need to start with raw memory buffer, +// not an initialized array, because initialization/destruction +// would make the span be at least O(N). +#if ONEDPL_USE_DPCPP_BACKEND +template class __buffer { + sycl::buffer<_Tp, 1> __buf; + + __buffer(const __buffer &) = delete; + + void operator=(const __buffer &) = delete; + +public: + // Try to obtain buffer of given size to store objects of _Tp type + __buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {} + + // Return pointer to buffer, or NULL if buffer could not be obtained. + auto get() -> decltype(oneapi::dpl::begin(__buf)) const { + return oneapi::dpl::begin(__buf); + } +}; +#else +template class __buffer { + std::unique_ptr<_Tp> _M_ptr; + + __buffer(const __buffer &) = delete; + + void operator=(const __buffer &) = delete; + +public: + // Try to obtain buffer of given size to store objects of _Tp type + __buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {} + + // Return pointer to buffer, or NULL if buffer could not be obtained. + _Tp *get() const { return _M_ptr.get(); } +}; +#endif + +// Implements C++14 std::less specialization to allow parameter type +// deduction. +class __less { +public: + template + bool operator()(_Xp &&__x, _Yp &&__y) const { + return std::forward<_Xp>(__x) < std::forward<_Yp>(__y); + } +}; + +template struct rebind_policy { + using type = Policy; +}; + +template +struct rebind_policy, + NewName> { + using type = oneapi::dpl::execution::device_policy; +}; + +#if _ONEDPL_FPGA_DEVICE +template +struct rebind_policy, + NewName> { + using type = oneapi::dpl::execution::fpga_policy; +}; +#endif + +template ::reference, + typename R2 = typename std::iterator_traits::reference> +struct perm_fun { + typedef R2 result_of; + perm_fun(T1 input) : source(input) {} + + R2 operator()(R1 x) const { return *(source + x); } + +private: + T1 source; +}; + +// Functor compares first element (key) from tied sequence. +template struct compare_key_fun { + typedef bool result_of; + compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {} + + template + result_of operator()(_T1 &&a, _T2 &&b) const { + using std::get; + return comp(get<0>(a), get<0>(b)); + } + +private: + mutable Compare comp; +}; + +// Functor evaluates second element of tied sequence with predicate. +// Used by: copy_if, remove_copy_if, stable_partition_copy +// Lambda: +template struct predicate_key_fun { + typedef bool result_of; + predicate_key_fun(Predicate _pred) : pred(_pred) {} + + template result_of operator()(_T1 &&a) const { + using std::get; + return pred(get<1>(a)); + } + +private: + mutable Predicate pred; +}; + +// Used by: remove_if +template struct negate_predicate_key_fun { + typedef bool result_of; + negate_predicate_key_fun(Predicate _pred) : pred(_pred) {} + + template result_of operator()(_T1 &&a) const { + using std::get; + return !pred(get<1>(a)); + } + +private: + mutable Predicate pred; +}; + +template struct sequence_fun { + using result_type = T; + sequence_fun(T _init, T _step) : init(_init), step(_step) {} + + template result_type operator()(_T &&i) const { + return static_cast(init + step * i); + } + +private: + const T init; + const T step; +}; + +//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b))); +template struct unique_fun { + typedef bool result_of; + unique_fun(Predicate _pred) : pred(_pred) {} + template result_of operator()(_T &&a, _T &&b) const { + using std::get; + return pred(get<0>(a), get<0>(b)); + } + +private: + mutable Predicate pred; +}; + +// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a; +// }); +template struct replace_if_fun { +public: + typedef T result_of; + replace_if_fun(Predicate _pred, T _new_value) + : pred(_pred), new_value(_new_value) {} + + template T operator()(_T1 &&a, _T2 &&s) const { + return pred(s) ? new_value : a; + } + +private: + mutable Predicate pred; + const T new_value; +}; + +//[pred,op](Ref a){return pred(a) ? op(a) : a; } +template +struct transform_if_fun { + transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {} + template + void operator()(_T&& t) const { + using std::get; + if (pred(get<0>(t))) + get<1>(t) = op(get<0>(t)); + } + +private: + mutable Predicate pred; + mutable Operator op; +}; + +//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; }); +template +struct transform_if_unary_zip_mask_fun { + transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {} + template + void operator()(_T&& t) const { + using std::get; + if (pred(get<1>(t))) + get<2>(t) = op(get<0>(t)); + } + +private: + mutable Predicate pred; + mutable Operator op; +}; + +template +class transform_if_zip_mask_fun { +public: + transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(), + BinaryOperation _op = oneapi::dpl::identity()) + : pred(_pred), op(_op) {} + template void operator()(_T &&t) const { + using std::get; + if (pred(get<2>(t))) + get<3>(t) = op(get<0>(t), get<1>(t)); + } + +private: + mutable Predicate pred; + mutable BinaryOperation op; +}; + +// This following code is similar to a section of code in +// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +// It has a similar approach, and could be consolidated. +// Outside of some differences in approach, there are two significant +// differences in function. +// +// 1) This code allows the output type of the bit range translation to be fit +// into to the minimal type required to provide that many bits. The code in +// oneDPL to calculate the bucket for the radix is similar but its output is +// always std::uint32_t. The assumption that the bit range desired will fit in +// 32 bits is not true for this code. +// +// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the +// same value. This allows the output of this translation to be used to provide +// a sort which ensures the stability of these values for floating point types. + +template struct uint_byte_map {}; +template <> struct uint_byte_map<1> { using type = uint8_t; }; +template <> struct uint_byte_map<2> { using type = uint16_t; }; +template <> struct uint_byte_map<4> { using type = uint32_t; }; +template <> struct uint_byte_map<8> { using type = uint64_t; }; + +template struct uint_map { + using type = typename uint_byte_map::type; +}; + +template class translate_key { + using uint_type_t = typename uint_map::type; + +public: + translate_key(int begin_bit, int end_bit) { + shift = begin_bit; + mask = ~OutKeyT(0); // all ones + mask = mask >> (sizeof(OutKeyT) * 8 - + (end_bit - begin_bit)); // setup appropriate mask + flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit + flip_key = ~uint_type_t(0); // 0xF...F + } + + inline OutKeyT operator()(const T &key) const { + uint_type_t intermediate; + if constexpr (std::is_floating_point::value) { + // normal case (both -0.0f and 0.0f equal -0.0f) + if (key != T(-0.0f)) { + uint_type_t is_negative = reinterpret_cast(key) >> + (sizeof(uint_type_t) * 8 - 1); + intermediate = reinterpret_cast(key) ^ + ((is_negative * flip_key) | flip_sign); + } else // special case for -0.0f to keep stability with 0.0f + { + T negzero = T(-0.0f); + intermediate = reinterpret_cast(negzero); + } + } else if constexpr (std::is_signed::value) { + intermediate = reinterpret_cast(key) ^ flip_sign; + } else { + intermediate = key; + } + + return static_cast(intermediate >> shift) & + mask; // shift, cast, and mask + } + +private: + uint8_t shift; + OutKeyT mask; + uint_type_t flip_sign; + uint_type_t flip_key; +}; + +// Unary operator that returns reference to its argument. Ported from +// oneDPL: oneapi/dpl/pstl/utils.h +struct no_op_fun { + template Tp &&operator()(Tp &&a) const { + return ::std::forward(a); + } +}; + +// Unary functor which composes a pair of functors by calling them in succession +// on an input +template +struct __composition_functor { + __composition_functor(FunctorInner in, FunctorOuter out) + : _in(in), _out(out) {} + template T operator()(const T &i) const { + return _out(_in(i)); + } + FunctorInner _in; + FunctorOuter _out; +}; + +// Unary functor which maps an index of a ROI into a 2D flattened array +template struct __roi_2d_index_functor { + __roi_2d_index_functor(const OffsetT &num_cols, + const ::std::size_t &row_stride) + : _num_cols(num_cols), _row_stride(row_stride) {} + + template Index operator()(const Index &i) const { + return _row_stride * (i / _num_cols) + (i % _num_cols); + } + + OffsetT _num_cols; + ::std::size_t _row_stride; +}; + +// Unary functor which maps and index into an interleaved array by its active +// channel +template struct __interleaved_index_functor { + __interleaved_index_functor(const OffsetT &total_channels, + const OffsetT &active_channel) + : _total_channels(total_channels), _active_channel(active_channel) {} + + template Index operator()(const Index &i) const { + return i * _total_channels + _active_channel; + } + + OffsetT _total_channels; + OffsetT _active_channel; +}; + +} // end namespace internal + +} // end namespace dpct + +#endif diff --git a/dpct/dpl_extras/iterators.h b/dpct/dpl_extras/iterators.h new file mode 100644 index 0000000000000..2e1d10986728e --- /dev/null +++ b/dpct/dpl_extras/iterators.h @@ -0,0 +1,347 @@ +//==---- iterators.h ------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_ITERATORS_H__ +#define __DPCT_ITERATORS_H__ + +#include + +#include "functional.h" + +namespace dpct { + +namespace internal { + +// Wrapper class returned from a dereferenced transform_iterator which was +// created using +// make_transform_output_iterator(). Used to apply the supplied transform +// function when writing into an object of this class. +// +// Example: +// int a[] = {0, 1, 2, 3, 4}; +// int* p = a; +// auto f = [](auto v) {return v*v;}; +// auto tr_out = dpct::make_transform_output_iterator(p+1, f); +// auto wrap = *tr_out; // wrap is a transform_output_ref_wrapper +// std::cout<<*(p+1)< class transform_output_ref_wrapper { +private: + T __my_reference_; + _UnaryFunc __my_unary_func_; + +public: + template + transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func) + : __my_reference_(std::forward(__reference)), + __my_unary_func_(__unary_func) {} + + // When writing to an object of this type, apply the supplied unary function, + // then write to the wrapped reference + template + transform_output_ref_wrapper &operator=(const UnaryInputType &e) { + __my_reference_ = __my_unary_func_(e); + return *this; + } +}; + +// Unary functor to create a transform_output_reference_wrapper when a +// transform_iterator is dereferenced, so that a +// the supplied unary function may be applied on write, resulting in a +// transform_output_iterator +template struct _Unary_Out { + _Unary_Out(_UnaryFunc __f_) : __f(__f_) {} + _UnaryFunc __f; + template auto operator()(T &&val) const { + return transform_output_ref_wrapper(std::forward(val), + __f); + } +}; + +} // end namespace internal + +using std::advance; + +using std::distance; + +template +oneapi::dpl::counting_iterator make_counting_iterator(const T &input) { + return oneapi::dpl::counting_iterator(input); +} + +template class constant_iterator { +public: + typedef std::false_type is_hetero; + typedef std::true_type is_passed_directly; + typedef std::ptrdiff_t difference_type; + typedef _Tp value_type; + typedef _Tp *pointer; + // There is no storage behind the iterator, so we return a value instead of + // reference. + typedef const _Tp reference; + typedef const _Tp const_reference; + typedef std::random_access_iterator_tag iterator_category; + + explicit constant_iterator(_Tp __init) + : __my_value_(__init), __my_counter_(0) {} + +private: + // used to construct iterator instances with different counter values required + // by arithmetic operators + constant_iterator(const _Tp &__value, const difference_type &__offset) + : __my_value_(__value), __my_counter_(__offset) {} + +public: + // non-const variants of access operators are not provided so unintended + // writes are caught at compile time. + const_reference operator*() const { return __my_value_; } + const_reference operator[](difference_type) const { return __my_value_; } + + difference_type operator-(const constant_iterator &__it) const { + return __my_counter_ - __it.__my_counter_; + } + + constant_iterator &operator+=(difference_type __forward) { + __my_counter_ += __forward; + return *this; + } + constant_iterator &operator-=(difference_type __backward) { + return *this += -__backward; + } + constant_iterator &operator++() { return *this += 1; } + constant_iterator &operator--() { return *this -= 1; } + + constant_iterator operator++(int) { + constant_iterator __it(*this); + ++(*this); + return __it; + } + constant_iterator operator--(int) { + constant_iterator __it(*this); + --(*this); + return __it; + } + + constant_iterator operator-(difference_type __backward) const { + return constant_iterator(__my_value_, __my_counter_ - __backward); + } + constant_iterator operator+(difference_type __forward) const { + return constant_iterator(__my_value_, __my_counter_ + __forward); + } + friend constant_iterator operator+(difference_type __forward, + const constant_iterator __it) { + return __it + __forward; + } + + bool operator==(const constant_iterator &__it) const { + return __my_value_ == __it.__my_value_ && + this->__my_counter_ == __it.__my_counter_; + } + bool operator!=(const constant_iterator &__it) const { + return !(*this == __it); + } + bool operator<(const constant_iterator &__it) const { + return *this - __it < 0; + } + bool operator>(const constant_iterator &__it) const { return __it < *this; } + bool operator<=(const constant_iterator &__it) const { + return !(*this > __it); + } + bool operator>=(const constant_iterator &__it) const { + return !(*this < __it); + } + +private: + _Tp __my_value_; + uint64_t __my_counter_; +}; + +template +constant_iterator<_Tp> make_constant_iterator(_Tp __value) { + return constant_iterator<_Tp>(__value); +} + +// key_value_pair class to represent a key and value, specifically a +// dereferenced arg_index_input_iterator +template class key_value_pair { +public: + key_value_pair() = default; + + key_value_pair(const _KeyTp &_key, const _ValueTp &_value) + : key(_key), value(_value) {} + + bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const { + return (key == _kvp.key) && (value == _kvp.value); + } + + bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const { + return (key != _kvp.key) || (value != _kvp.value); + } + + _KeyTp key; + _ValueTp value; +}; + +namespace detail { + +template struct make_key_value_pair { + template + key_value_pair + operator()(const oneapi::dpl::__internal::tuple &tup) const { + return ::dpct::key_value_pair(::std::get<0>(tup), + ::std::get<1>(tup)); + } +}; + +template struct __zip_iterator_impl; +template struct __zip_iterator_impl> { + using type = oneapi::dpl::zip_iterator; +}; + +} // end namespace detail + +// dpct::zip_iterator can only accept std::tuple type as template argument for +// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to +// pass iterator's types directly. +template +using zip_iterator = typename detail::__zip_iterator_impl::type; + +// arg_index_input_iterator is an iterator over a input iterator, with a index. +// When dereferenced, it returns a key_value_pair, which can be interrogated for +// the index key or the value from the input iterator +template ::value_type> +class arg_index_input_iterator + : public oneapi::dpl::transform_iterator< + oneapi::dpl::zip_iterator, + InputIteratorT>, + detail::make_key_value_pair> { + using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator< + oneapi::dpl::zip_iterator, + InputIteratorT>, + detail::make_key_value_pair>; + +public: + typedef OffsetT difference_type; + + // signal to __get_sycl_range that this iterator is as a direct pass iterator + using is_zip = ::std::true_type; + + arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap) + : arg_index_input_iterator_wrap(__arg_wrap) {} + arg_index_input_iterator(InputIteratorT __iter) + : arg_index_input_iterator_wrap( + oneapi::dpl::make_zip_iterator( + oneapi::dpl::counting_iterator(OffsetT(0)), __iter), + detail::make_key_value_pair()) {} + + arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) { + arg_index_input_iterator_wrap::operator=(__input); + return *this; + } + arg_index_input_iterator &operator++() { + arg_index_input_iterator_wrap::operator++(); + return *this; + } + arg_index_input_iterator &operator--() { + arg_index_input_iterator_wrap::operator--(); + return *this; + } + arg_index_input_iterator operator++(int) { + arg_index_input_iterator __it(*this); + ++(*this); + return __it; + } + arg_index_input_iterator operator--(int) { + arg_index_input_iterator __it(*this); + --(*this); + return __it; + } + arg_index_input_iterator operator+(difference_type __forward) const { + return arg_index_input_iterator( + arg_index_input_iterator_wrap::operator+(__forward)); + } + arg_index_input_iterator operator-(difference_type __backward) const { + return arg_index_input_iterator( + arg_index_input_iterator_wrap::operator-(__backward)); + } + arg_index_input_iterator &operator+=(difference_type __forward) { + arg_index_input_iterator_wrap::operator+=(__forward); + return *this; + } + arg_index_input_iterator &operator-=(difference_type __backward) { + arg_index_input_iterator_wrap::operator-=(__backward); + return *this; + } + + friend arg_index_input_iterator + operator+(difference_type __forward, const arg_index_input_iterator &__it) { + return __it + __forward; + } + + difference_type operator-(const arg_index_input_iterator &__it) const { + return arg_index_input_iterator_wrap::operator-(__it); + } + bool operator==(const arg_index_input_iterator &__it) const { + return arg_index_input_iterator_wrap::operator==(__it); + } + bool operator!=(const arg_index_input_iterator &__it) const { + return !(*this == __it); + } + bool operator<(const arg_index_input_iterator &__it) const { + return *this - __it < 0; + } + bool operator>(const arg_index_input_iterator &__it) const { + return __it < *this; + } + bool operator<=(const arg_index_input_iterator &__it) const { + return !(*this > __it); + } + bool operator>=(const arg_index_input_iterator &__it) const { + return !(*this < __it); + } + + // returns an arg_index_input_iterator with the same iter position, but a + // count reset to 0 + arg_index_input_iterator create_normalized() { + return arg_index_input_iterator( + ::std::get<1>(arg_index_input_iterator_wrap::base().base())); + } +}; + +template struct io_iterator_pair { + inline io_iterator_pair() : selector(false) {} + + inline io_iterator_pair(const IterT &first, const IterT &second) + : selector(false) { + iter[0] = first; + iter[1] = second; + } + + inline IterT first() const { return selector ? iter[1] : iter[0]; } + + inline IterT second() const { return selector ? iter[0] : iter[1]; } + + inline void swap() { selector = !selector; } + + bool selector; + + IterT iter[2]; +}; + +template +auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) { + return oneapi::dpl::transform_iterator( + __it, internal::_Unary_Out<_UnaryFunc>(__unary_func)); +} + +} // end namespace dpct + +#endif diff --git a/dpct/dpl_extras/memory.h b/dpct/dpl_extras/memory.h new file mode 100644 index 0000000000000..08b965133f519 --- /dev/null +++ b/dpct/dpl_extras/memory.h @@ -0,0 +1,1024 @@ +//==---- memory.h ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_MEMORY_H__ +#define __DPCT_MEMORY_H__ + +#include +#include +#include "functional.h" + +// Memory management section: +// device_pointer, device_reference, swap, device_iterator, malloc_device, +// device_new, free_device, device_delete +namespace dpct { +namespace detail { +template +struct make_allocatable +{ + using type = T; +}; + +template <> +struct make_allocatable +{ + using type = dpct::byte_t; +}; + +#if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) && \ + defined(__LIBSYCL_PATCH_VERSION) +#define _DPCT_LIBSYCL_VERSION \ + (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 + \ + __LIBSYCL_PATCH_VERSION) +#else +#define _DPCT_LIBSYCL_VERSION 0 +#endif + +template +using __buffer_allocator = +#if _DPCT_LIBSYCL_VERSION >= 60000 + sycl::buffer_allocator::type>; +#else + sycl::buffer_allocator; +#endif +} // namespace detail + +#ifdef DPCT_USM_LEVEL_NONE +template > +class device_pointer; +#else +template class device_pointer; +#endif + +template struct device_reference { + using pointer = device_pointer; + using value_type = T; + template + device_reference(const device_reference &input) + : value(input.value) {} + device_reference(const pointer &input) : value((*input).value) {} + device_reference(value_type &input) : value(input) {} + template + device_reference &operator=(const device_reference &input) { + value = input; + return *this; + }; + device_reference &operator=(const device_reference &input) { + T val = input.value; + value = val; + return *this; + }; + device_reference &operator=(const value_type &x) { + value = x; + return *this; + }; + pointer operator&() const { return pointer(&value); }; + operator value_type() const { return T(value); } + device_reference &operator++() { + ++value; + return *this; + }; + device_reference &operator--() { + --value; + return *this; + }; + device_reference operator++(int) { + device_reference ref(*this); + ++(*this); + return ref; + }; + device_reference operator--(int) { + device_reference ref(*this); + --(*this); + return ref; + }; + device_reference &operator+=(const T &input) { + value += input; + return *this; + }; + device_reference &operator-=(const T &input) { + value -= input; + return *this; + }; + device_reference &operator*=(const T &input) { + value *= input; + return *this; + }; + device_reference &operator/=(const T &input) { + value /= input; + return *this; + }; + device_reference &operator%=(const T &input) { + value %= input; + return *this; + }; + device_reference &operator&=(const T &input) { + value &= input; + return *this; + }; + device_reference &operator|=(const T &input) { + value |= input; + return *this; + }; + device_reference &operator^=(const T &input) { + value ^= input; + return *this; + }; + device_reference &operator<<=(const T &input) { + value <<= input; + return *this; + }; + device_reference &operator>>=(const T &input) { + value >>= input; + return *this; + }; + void swap(device_reference &input) { + T tmp = (*this); + *this = (input); + input = (tmp); + } + T &value; +}; + +template +void swap(device_reference &x, device_reference &y) { + x.swap(y); +} + +template void swap(T &x, T &y) { + T tmp = x; + x = y; + y = tmp; +} + +template +::std::ostream &operator<<(::std::ostream &out, + const device_reference &ref) { + return out << T(ref); +} + +namespace internal { +// struct for checking if iterator is heterogeneous or not +template // for non-heterogeneous iterators +struct is_hetero_iterator : std::false_type {}; + +template // for heterogeneous iterators +struct is_hetero_iterator< + Iter, typename std::enable_if::type> + : std::true_type {}; +} // namespace internal + +#ifdef DPCT_USM_LEVEL_NONE +// Must be forward declared due to default argument +template +device_pointer device_new(device_pointer, const T &, + const std::size_t = 1); + +template +class device_iterator; + +template +class device_pointer_base { +protected: + sycl::buffer buffer; + std::size_t idx; + + // Declare friend to give access to protected buffer and idx members + template + friend device_pointer device_new(device_pointer, const T &, + const std::size_t); + +public: + using pointer = ValueType *; + using difference_type = std::make_signed::type; + + device_pointer_base(sycl::buffer in, std::size_t i = 0) + : buffer(in), idx(i) {} +#ifdef __USE_DPCT + template + device_pointer_base(OtherT *ptr) + : buffer( + dpct::detail::mem_mgr::instance() + .translate_ptr(ptr) + .buffer.template reinterpret(sycl::range<1>( + dpct::detail::mem_mgr::instance().translate_ptr(ptr).size / + sizeof(ValueType)))), + idx(ptr - (ValueType*)dpct::detail::mem_mgr::instance() + .translate_ptr(ptr).alloc_ptr) {} +#endif + device_pointer_base(const std::size_t count) + : buffer(sycl::range<1>(count / sizeof(ValueType))), idx() {} + // buffer has no default ctor we pass zero-range to create an empty buffer + device_pointer_base() : buffer(sycl::range<1>(0)) {} + device_pointer_base(const device_pointer_base &in) + : buffer(in.buffer), idx(in.idx) {} + pointer get() const { + auto res = + (const_cast(this) + ->buffer.template get_access()) + .get_pointer(); + return res + idx; + } + operator ValueType *() { + auto res = (buffer.template get_access()) + .get_pointer(); + return res + idx; + } + operator ValueType *() const { + auto res = + (const_cast(this) + ->buffer.template get_access()) + .get_pointer(); + return res + idx; + } + Derived operator+(difference_type forward) const { + return Derived{buffer, idx + forward}; + } + Derived operator-(difference_type backward) const { + return Derived{buffer, idx - backward}; + } + Derived operator++(int) { + Derived p(buffer, idx); + idx += 1; + return p; + } + Derived operator--(int) { + Derived p(buffer, idx); + idx -= 1; + return p; + } + difference_type operator-(const Derived &it) const { return idx - it.idx; } + template + typename std::enable_if::value, + difference_type>::type + operator-(const OtherIterator &it) const { + return idx - std::distance(oneapi::dpl::begin(buffer), it); + } + + std::size_t get_idx() const { return idx; } // required + + sycl::buffer get_buffer() { + return buffer; + } // required +}; + +template +class device_pointer + : public device_pointer_base> { +private: + using base_type = + device_pointer_base; + +public: + using value_type = dpct::byte_t; + using difference_type = std::make_signed::type; + using pointer = void *; + using reference = value_type &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::true_type; // required + using is_passed_directly = std::false_type; + static constexpr sycl::access_mode mode = Mode; // required + + device_pointer(sycl::buffer in, std::size_t i = 0) + : base_type(in, i) {} +#ifdef __USE_DPCT + template device_pointer(OtherT *ptr) : base_type(ptr) {} +#endif + // needed for malloc_device, count is number of bytes to allocate + device_pointer(const std::size_t count) : base_type(count) {} + device_pointer() : base_type() {} + device_pointer(const device_pointer &in) : base_type(in) {} + device_pointer &operator+=(difference_type forward) { + this->idx += forward; + return *this; + } + device_pointer &operator-=(difference_type backward) { + this->idx -= backward; + return *this; + } + // include operators from base class + using base_type::operator++; + using base_type::operator--; + device_pointer &operator++() { + this->idx += 1; + return *this; + } + device_pointer &operator--() { + this->idx -= 1; + return *this; + } +}; + +template +class device_pointer + : public device_pointer_base> { +private: + using base_type = device_pointer_base; + +public: + using value_type = T; + using difference_type = std::make_signed::type; + using pointer = T *; + using reference = T &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::true_type; // required + using is_passed_directly = std::false_type; + static constexpr sycl::access_mode mode = Mode; // required + + device_pointer(sycl::buffer in, std::size_t i = 0) : base_type(in, i) {} +#ifdef __USE_DPCT + template device_pointer(OtherT *ptr) : base_type(ptr) {} +#endif + // needed for malloc_device, count is number of bytes to allocate + device_pointer(const std::size_t count) : base_type(count) {} + device_pointer() : base_type() {} + device_pointer(const device_pointer &in) : base_type(in) {} + device_pointer &operator+=(difference_type forward) { + this->idx += forward; + return *this; + } + device_pointer &operator-=(difference_type backward) { + this->idx -= backward; + return *this; + } + operator device_pointer() { + auto converted_buf = (this->buffer) + .template reinterpret(sycl::range<1>( + sizeof(value_type) * this->buffer.size())); + return device_pointer(converted_buf, this->idx); + } + // include operators from base class + using base_type::operator++; + using base_type::operator--; + device_pointer &operator++() { + this->idx += 1; + return *this; + } + device_pointer &operator--() { + this->idx -= 1; + return *this; + } +}; +#else +template class device_iterator; + +template class device_pointer_base { +protected: + ValueType *ptr; + +public: + using pointer = ValueType *; + using difference_type = std::make_signed::type; + + device_pointer_base(ValueType *p) : ptr(p) {} + device_pointer_base(const std::size_t count) { + sycl::queue default_queue = dpct::get_default_queue(); + ptr = static_cast(sycl::malloc_shared( + count, default_queue.get_device(), default_queue.get_context())); + } + device_pointer_base() {} + pointer get() const { return ptr; } + operator ValueType *() { return ptr; } + operator ValueType *() const { return ptr; } + + ValueType &operator[](difference_type idx) { return ptr[idx]; } + ValueType &operator[](difference_type idx) const { return ptr[idx]; } + + Derived operator+(difference_type forward) const { + return Derived{ptr + forward}; + } + Derived operator-(difference_type backward) const { + return Derived{ptr - backward}; + } + Derived operator++(int) { + Derived p(ptr); + ++ptr; + return p; + } + Derived operator--(int) { + Derived p(ptr); + --ptr; + return p; + } + difference_type operator-(const Derived &it) const { return ptr - it.ptr; } +}; + +template <> +class device_pointer + : public device_pointer_base> { +private: + using base_type = device_pointer_base>; + +public: + using value_type = dpct::byte_t; + using difference_type = std::make_signed::type; + using pointer = void *; + using reference = value_type &; + using const_reference = const value_type &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::false_type; // required + using is_passed_directly = std::true_type; // required + + device_pointer(void *p) : base_type(static_cast(p)) {} + // needed for malloc_device, count is number of bytes to allocate + device_pointer(const std::size_t count) : base_type(count) {} + device_pointer() : base_type() {} + pointer get() const { return static_cast(this->ptr); } + operator void *() { return this->ptr; } + operator void *() const { return this->ptr; } + + // include operators from base class + using base_type::operator++; + using base_type::operator--; + device_pointer &operator++() { + ++(this->ptr); + return *this; + } + device_pointer &operator--() { + --(this->ptr); + return *this; + } + device_pointer &operator+=(difference_type forward) { + this->ptr = this->ptr + forward; + return *this; + } + device_pointer &operator-=(difference_type backward) { + this->ptr = this->ptr - backward; + return *this; + } +}; + +template +class device_pointer : public device_pointer_base> { +private: + using base_type = device_pointer_base>; + +public: + using value_type = T; + using difference_type = std::make_signed::type; + using pointer = T *; + using reference = T &; + using const_reference = const T &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::false_type; // required + using is_passed_directly = std::true_type; // required + + device_pointer(T *p) : base_type(p) {} + // needed for malloc_device, count is number of bytes to allocate + device_pointer(const std::size_t count) : base_type(count) {} + device_pointer() : base_type() {} + device_pointer &operator=(const device_iterator &in) { + this->ptr = static_cast>(in).ptr; + return *this; + } + operator device_pointer() { + return device_pointer(static_cast(this->ptr)); + } + // include operators from base class + using base_type::operator++; + using base_type::operator--; + device_pointer &operator++() { + ++(this->ptr); + return *this; + } + device_pointer &operator--() { + --(this->ptr); + return *this; + } + device_pointer &operator+=(difference_type forward) { + this->ptr = this->ptr + forward; + return *this; + } + device_pointer &operator-=(difference_type backward) { + this->ptr = this->ptr - backward; + return *this; + } +}; +#endif + +#ifdef DPCT_USM_LEVEL_NONE +template > +class device_iterator : public device_pointer { + using Base = device_pointer; + +public: + using value_type = T; + using difference_type = std::make_signed::type; + using pointer = T *; + using reference = T &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::true_type; // required + using is_passed_directly = std::false_type; // required + static constexpr sycl::access_mode mode = Mode; // required + + device_iterator() : Base() {} + device_iterator(sycl::buffer vec, std::size_t index) + : Base(vec, index) {} + device_iterator(const Base &dev_ptr) : Base(dev_ptr) {} + template + device_iterator(const device_iterator &in) + : Base(in.buffer, in.idx) {} // required for iter_mode + device_iterator &operator=(const device_iterator &in) { + Base::buffer = in.buffer; + Base::idx = in.idx; + return *this; + } + + reference operator*() const { + return const_cast(this) + ->buffer.template get_access()[Base::idx]; + } + + reference operator[](difference_type i) const { return *(*this + i); } + device_iterator &operator++() { + ++Base::idx; + return *this; + } + device_iterator &operator--() { + --Base::idx; + return *this; + } + device_iterator operator++(int) { + device_iterator it(*this); + ++(*this); + return it; + } + device_iterator operator--(int) { + device_iterator it(*this); + --(*this); + return it; + } + device_iterator operator+(difference_type forward) const { + const auto new_idx = Base::idx + forward; + return {Base::buffer, new_idx}; + } + device_iterator &operator+=(difference_type forward) { + Base::idx += forward; + return *this; + } + device_iterator operator-(difference_type backward) const { + return {Base::buffer, Base::idx - backward}; + } + device_iterator &operator-=(difference_type backward) { + Base::idx -= backward; + return *this; + } + friend device_iterator operator+(difference_type forward, + const device_iterator &it) { + return it + forward; + } + difference_type operator-(const device_iterator &it) const { + return Base::idx - it.idx; + } + template + typename std::enable_if::value, + difference_type>::type + operator-(const OtherIterator &it) const { + return Base::idx - std::distance(oneapi::dpl::begin(Base::buffer), it); + } + bool operator==(const device_iterator &it) const { return *this - it == 0; } + bool operator!=(const device_iterator &it) const { return !(*this == it); } + bool operator<(const device_iterator &it) const { return *this - it < 0; } + bool operator>(const device_iterator &it) const { return it < *this; } + bool operator<=(const device_iterator &it) const { return !(*this > it); } + bool operator>=(const device_iterator &it) const { return !(*this < it); } + + std::size_t get_idx() const { return Base::idx; } // required + + sycl::buffer get_buffer() { + return Base::buffer; + } // required +}; +#else +template class device_iterator : public device_pointer { + using Base = device_pointer; + +protected: + std::size_t idx; + +public: + using value_type = T; + using difference_type = std::make_signed::type; + using pointer = typename Base::pointer; + using reference = typename Base::reference; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = std::false_type; // required + using is_passed_directly = std::true_type; // required + static constexpr sycl::access_mode mode = + sycl::access_mode::read_write; // required + + device_iterator() : Base(nullptr), idx(0) {} + device_iterator(T *vec, std::size_t index) : Base(vec), idx(index) {} + device_iterator(const Base &dev_ptr) : Base(dev_ptr), idx(0) {} + template + device_iterator(const device_iterator &in) + : Base(in.ptr), idx(in.idx) {} // required for iter_mode + device_iterator &operator=(const device_iterator &in) { + Base::operator=(in); + idx = in.idx; + return *this; + } + + reference operator*() const { return *(Base::ptr + idx); } + + reference operator[](difference_type i) { return Base::ptr[idx + i]; } + reference operator[](difference_type i) const { return Base::ptr[idx + i]; } + device_iterator &operator++() { + ++idx; + return *this; + } + device_iterator &operator--() { + --idx; + return *this; + } + device_iterator operator++(int) { + device_iterator it(*this); + ++(*this); + return it; + } + device_iterator operator--(int) { + device_iterator it(*this); + --(*this); + return it; + } + device_iterator operator+(difference_type forward) const { + const auto new_idx = idx + forward; + return {Base::ptr, new_idx}; + } + device_iterator &operator+=(difference_type forward) { + idx += forward; + return *this; + } + device_iterator operator-(difference_type backward) const { + return {Base::ptr, idx - backward}; + } + device_iterator &operator-=(difference_type backward) { + idx -= backward; + return *this; + } + friend device_iterator operator+(difference_type forward, + const device_iterator &it) { + return it + forward; + } + difference_type operator-(const device_iterator &it) const { + return idx - it.idx; + } + + template + typename std::enable_if::value, + difference_type>::type + operator-(const OtherIterator &it) const { + return idx - it.get_idx(); + } + + bool operator==(const device_iterator &it) const { return *this - it == 0; } + bool operator!=(const device_iterator &it) const { return !(*this == it); } + bool operator<(const device_iterator &it) const { return *this - it < 0; } + bool operator>(const device_iterator &it) const { return it < *this; } + bool operator<=(const device_iterator &it) const { return !(*this > it); } + bool operator>=(const device_iterator &it) const { return !(*this < it); } + + std::size_t get_idx() const { return idx; } // required + + device_iterator &get_buffer() { return *this; } // required + + std::size_t size() const { return idx; } +}; +#endif + +struct sys_tag {}; +struct device_sys_tag : public sys_tag {}; +struct host_sys_tag : public sys_tag {}; + +#ifdef DPCT_USM_LEVEL_NONE +template class tagged_pointer { + static_assert(false, + "tagged_pointer is not supported with DPCT_USM_LEVEL_NONE"); +}; +template +void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) { + static_assert( + false, + "release_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE"); +} +template +auto get_temporary_allocation(PolicyOrTag &&policy_or_tag, + SizeType num_elements) { + static_assert( + false, + "get_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE"); +} +template +auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) { + static_assert(false, "malloc is not supported with DPCT_USM_LEVEL_NONE"); +} +template +auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) { + static_assert(false, "malloc is not supported with DPCT_USM_LEVEL_NONE"); +} +template +void free(PolicyOrTag &&policy_or_tag, Pointer ptr) { + static_assert(false, "free is not supported with DPCT_USM_LEVEL_NONE"); +} +#else +namespace internal { + +// Utility that converts a policy to a tag or reflects a provided tag +template struct policy_or_tag_to_tag { +private: + using decayed_policy_or_tag_t = ::std::decay_t; + using policy_conversion = ::std::conditional_t< + !is_hetero_execution_policy::value, host_sys_tag, + device_sys_tag>; + static constexpr bool is_policy_v = + oneapi::dpl::execution::is_execution_policy_v; + static constexpr bool is_sys_tag_v = ::std::disjunction_v< + ::std::is_same, + ::std::is_same>; + static_assert(is_policy_v || is_sys_tag_v, + "Only oneDPL policies or system tags may be provided"); + +public: + using type = ::std::conditional_t; +}; + +template +using policy_or_tag_to_tag_t = typename policy_or_tag_to_tag::type; + +template struct is_host_policy_or_tag { +private: + using tag_t = policy_or_tag_to_tag_t; + +public: + static constexpr bool value = ::std::is_same_v; +}; + +template +inline constexpr bool is_host_policy_or_tag_v = + is_host_policy_or_tag::value; + +} // namespace internal + +// TODO: Make this class an iterator adaptor. +// tagged_pointer provides a wrapper around a raw pointer type with a tag of the +// location of the allocated memory. Standard pointer operations are supported +// with this class. +template class tagged_pointer { +public: + using value_type = T; + using difference_type = ::std::ptrdiff_t; + using pointer = T *; + using reference = T &; + using iterator_category = std::random_access_iterator_tag; + using is_hetero = ::std::false_type; + using is_passed_directly = std::true_type; + + tagged_pointer() : m_ptr(nullptr) {} + tagged_pointer(T *ptr) : m_ptr(ptr) {} + T &operator[](difference_type idx) { return this->m_ptr[idx]; } + const T &operator[](difference_type idx) const { return this->m_ptr[idx]; } + tagged_pointer operator+(difference_type forward) const { + return tagged_pointer{this->m_ptr + forward}; + } + tagged_pointer operator-(difference_type backward) const { + return tagged_pointer{this->m_ptr - backward}; + } + operator const T *() const { return m_ptr; } + operator T *() { return m_ptr; } + T &operator*() { return *this->m_ptr; } + const T &operator*() const { return *this->m_ptr; } + T *operator->() { return this->m_ptr; } + const T *operator->() const { return this->m_ptr; } + tagged_pointer operator++(int) { + tagged_pointer p(this->m_ptr); + ++this->m_ptr; + return p; + } + tagged_pointer operator--(int) { + tagged_pointer p(this->m_ptr); + --this->m_ptr; + return p; + } + tagged_pointer &operator++() { + ++this->m_ptr; + return *this; + } + tagged_pointer &operator--() { + --this->m_ptr; + return *this; + } + difference_type operator-(const tagged_pointer &it) const { + return this->m_ptr - it.m_ptr; + } + tagged_pointer &operator+=(difference_type forward) { + this->m_ptr = this->m_ptr + forward; + return *this; + } + tagged_pointer &operator-=(difference_type backward) { + this->m_ptr = this->m_ptr - backward; + return *this; + } + +private: + T *m_ptr; +}; + +// Void specialization for tagged pointers. Iterator traits are not provided but +// conversion to other non-void tagged pointers is allowed. Pointer arithmetic +// is disallowed with this specialization. +template class tagged_pointer { +public: + using difference_type = ::std::ptrdiff_t; + using pointer = void *; + tagged_pointer() : m_ptr(nullptr) {} + tagged_pointer(pointer ptr) : m_ptr(ptr) {} + operator const void *() const { return m_ptr; } + operator void *() { return m_ptr; } + // Enable tagged void pointer to convert to all other raw pointer types. + template operator OtherPtr *() const { + return static_cast(this->m_ptr); + } + +private: + void *m_ptr; +}; + +namespace internal { + +// Internal utility to return raw pointer to allocated memory. Note that host +// allocations are not device accessible (not pinned). +template +void *malloc_base(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) { + using decayed_policy_or_tag_t = ::std::decay_t; + if constexpr (internal::is_host_policy_or_tag_v) { + return ::std::malloc(num_bytes); + } else { + sycl::queue q; + // Grab the associated queue if a device policy is provided. Otherwise, use + // default constructed. + if constexpr (oneapi::dpl::execution::is_execution_policy_v< + decayed_policy_or_tag_t>) { + q = policy_or_tag.queue(); + } else { + q = get_default_queue(); + } + return sycl::malloc_shared(num_bytes, q); + } +} + +} // namespace internal + +template +auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) { + return tagged_pointer>( + internal::malloc_base(::std::forward(policy_or_tag), + num_bytes)); +} + +template +auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) { + return tagged_pointer>( + static_cast( + internal::malloc_base(::std::forward(policy_or_tag), + num_elements * sizeof(T)))); +} + +template +void free(PolicyOrTag &&policy_or_tag, Pointer ptr) { + using decayed_policy_or_tag_t = ::std::decay_t; + if constexpr (internal::is_host_policy_or_tag_v) { + ::std::free(ptr); + } else { + sycl::queue q; + // Grab the associated queue if a device policy is provided. Otherwise, use + // default constructed. + if constexpr (oneapi::dpl::execution::is_execution_policy_v< + decayed_policy_or_tag_t>) { + q = policy_or_tag.queue(); + } else { + q = get_default_queue(); + } + sycl::free(ptr, q); + } +} + +template +auto get_temporary_allocation(PolicyOrTag &&policy_or_tag, + SizeType num_elements) { + auto allocation_ptr = + dpct::malloc(::std::forward(policy_or_tag), num_elements); + if (allocation_ptr == nullptr) + return ::std::make_pair(allocation_ptr, SizeType(0)); + return ::std::make_pair(allocation_ptr, num_elements); +} + +template +void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) { + dpct::free(::std::forward(policy_or_tag), ptr); +} +#endif + +template +device_pointer malloc_device(const std::size_t num_elements) { + return device_pointer(num_elements * sizeof(T)); +} +static inline device_pointer malloc_device(const std::size_t num_bytes) { + return device_pointer(num_bytes); +} +#ifdef DPCT_USM_LEVEL_NONE +template +device_pointer device_new(device_pointer p, const T &value, + const std::size_t count) { + auto converted_buf = p.buffer.template reinterpret(sycl::range<1>(count)); + ::std::uninitialized_fill( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + oneapi::dpl::begin(converted_buf), + oneapi::dpl::end(converted_buf), value); + return device_pointer(converted_buf, p.idx); +} +// buffer manages lifetime +template void free_device(device_pointer ptr) {} +#else +template +device_pointer device_new(device_pointer p, const T &value, + const std::size_t count = 1) { + dpct::device_pointer converted_p(static_cast(p.get())); + ::std::uninitialized_fill( + oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + converted_p, converted_p + count, value); + return converted_p; +} +template void free_device(device_pointer ptr) { + sycl::free(ptr.get(), dpct::get_default_queue()); +} +#endif +template +device_pointer device_new(device_pointer p, + const std::size_t count = 1) { + return device_new(p, T{}, count); +} +template +device_pointer device_new(const std::size_t count = 1) { + return device_new(device_pointer(sizeof(T) * count), T{}, count); +} + +template +typename std::enable_if::value, void>::type +device_delete(device_pointer p, const std::size_t count = 1) { + ::std::destroy(oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()), + p, p + count); + free_device(p); +} +template +typename std::enable_if::value, void>::type +device_delete(device_pointer p, const std::size_t count = 1) { + free_device(p); +} + +template device_pointer get_device_pointer(T *ptr) { + return device_pointer(ptr); +} + +template +device_pointer get_device_pointer(const device_pointer &ptr) { + return device_pointer(ptr); +} + +template T *get_raw_pointer(const device_pointer &ptr) { + return ptr.get(); +} + +template Pointer get_raw_pointer(const Pointer &ptr) { + return ptr; +} + +template const T &get_raw_reference(const device_reference &ref) { + return ref.value; +} + +template T &get_raw_reference(device_reference &ref) { + return ref.value; +} + +template const T &get_raw_reference(const T &ref) { + return ref; +} + +template T &get_raw_reference(T &ref) { + return ref; +} + +} // namespace dpct + +#endif diff --git a/dpct/dpl_extras/numeric.h b/dpct/dpl_extras/numeric.h new file mode 100644 index 0000000000000..9864cd17359f3 --- /dev/null +++ b/dpct/dpl_extras/numeric.h @@ -0,0 +1,32 @@ +//==---- numeric.h --------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_NUMERIC_H__ +#define __DPCT_NUMERIC_H__ + +namespace dpct { + +template +T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1, + InputIt2 first2, T init) { + return std::transform_reduce(std::forward(policy), first1, last1, + first2, init); +} + +template +T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1, + InputIt2 first2, T init, BinaryOperation1 op1, + BinaryOperation2 op2) { + return std::transform_reduce(std::forward(policy), first1, last1, + first2, init, op1, op2); +} + +} // end namespace dpct + +#endif diff --git a/dpct/dpl_extras/vector.h b/dpct/dpl_extras/vector.h new file mode 100644 index 0000000000000..afba575ae1da9 --- /dev/null +++ b/dpct/dpl_extras/vector.h @@ -0,0 +1,752 @@ +//==---- vector.h ---------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_VECTOR_H__ +#define __DPCT_VECTOR_H__ + +#include +#include + +#include + +#include "memory.h" + +#include +#include +#include + +#include "../device.hpp" + +namespace dpct { + +namespace internal { +template // for non-iterators +struct is_iterator : std::false_type {}; + +template // For iterators +struct is_iterator< + Iter, + typename std::enable_if< + !std::is_void::value, void>::type> + : std::true_type {}; + +template // For pointers +struct is_iterator : std::true_type {}; +} // end namespace internal + +#ifndef DPCT_USM_LEVEL_NONE + +template > +class device_vector { +public: + using iterator = device_iterator; + using const_iterator = const iterator; + using reference = device_reference; + using const_reference = const reference; + using value_type = T; + using pointer = T *; + using const_pointer = const T *; + using difference_type = + typename ::std::iterator_traits::difference_type; + using size_type = ::std::size_t; + +private: + Allocator _alloc; + size_type _size; + size_type _capacity; + pointer _storage; + + size_type _min_capacity() const { return size_type(1); } + + void _set_capacity_and_alloc() { + _capacity = ::std::max(_size * 2, _min_capacity()); + _storage = _alloc.allocate(_capacity); + } + +public: + template operator ::std::vector() const { + auto __tmp = ::std::vector(this->size()); + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + this->begin(), this->end(), __tmp.begin()); + return __tmp; + } + device_vector() + : _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) { + _set_capacity_and_alloc(); + } + ~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); }; + explicit device_vector(size_type n) : device_vector(n, T()) {} + explicit device_vector(size_type n, const T &value) + : _alloc(get_default_queue()), _size(n) { + _set_capacity_and_alloc(); + if (_size > 0) { + ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin(), end(), T(value)); + } + } + device_vector(const device_vector &other) : _alloc(get_default_queue()) { + _size = other.size(); + _capacity = other.capacity(); + _storage = _alloc.allocate(_capacity); + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + other.begin(), other.end(), begin()); + } + device_vector(device_vector &&other) + : _alloc(get_default_queue()), _size(other.size()), + _capacity(other.capacity()), _storage(other._storage) { + other._size = 0; + other._capacity = 0; + other._storage = nullptr; + } + + template + device_vector(InputIterator first, + typename ::std::enable_if< + internal::is_iterator::value && + !::std::is_pointer::value && + ::std::is_same::iterator_category, + ::std::random_access_iterator_tag>::value, + InputIterator>::type last) + : _alloc(get_default_queue()) { + _size = ::std::distance(first, last); + _set_capacity_and_alloc(); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, begin()); + } + } + + template + device_vector(InputIterator first, + typename ::std::enable_if<::std::is_pointer::value, + InputIterator>::type last) + : _alloc(get_default_queue()) { + _size = ::std::distance(first, last); + _set_capacity_and_alloc(); + if (_size > 0) { + auto ptr_type = sycl::get_pointer_type(first, get_default_context()); + if (ptr_type != sycl::usm::alloc::host && + ptr_type != sycl::usm::alloc::unknown) { + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, begin()); + } else { + sycl::buffer::value_type, + 1> + buf(first, last); + auto buf_first = oneapi::dpl::begin(buf); + auto buf_last = oneapi::dpl::end(buf); + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + buf_first, buf_last, begin()); + } + } + } + + template + device_vector(InputIterator first, + typename ::std::enable_if< + internal::is_iterator::value && + !::std::is_pointer::value && + !::std::is_same::iterator_category, + ::std::random_access_iterator_tag>::value, + InputIterator>::type last) + : _alloc(get_default_queue()), _size(::std::distance(first, last)) { + _set_capacity_and_alloc(); + ::std::vector _tmp(first, last); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + _tmp.begin(), _tmp.end(), this->begin()); + } + } + + template + device_vector(const device_vector &v) + : _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()), + _capacity(v.capacity()) {} + + template + device_vector(::std::vector &v) + : _alloc(get_default_queue()), _size(v.size()) { + _set_capacity_and_alloc(); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + v.begin(), v.end(), this->begin()); + } + } + + template + device_vector &operator=(const ::std::vector &v) { + resize(v.size()); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + v.begin(), v.end(), begin()); + } + return *this; + } + device_vector &operator=(const device_vector &other) { + // Copy assignment operator: + resize(other.size()); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + other.begin(), other.end(), begin()); + } + return *this; + } + device_vector &operator=(device_vector &&other) { + // Move assignment operator: + device_vector dummy(::std::move(other)); + this->swap(dummy); + return *this; + } + size_type size() const { return _size; } + iterator begin() noexcept { return device_iterator(_storage, 0); } + iterator end() { return device_iterator(_storage, size()); } + const_iterator begin() const noexcept { + return device_iterator(_storage, 0); + } + const_iterator cbegin() const noexcept { return begin(); } + const_iterator end() const { return device_iterator(_storage, size()); } + const_iterator cend() const { return end(); } + T *real_begin() { return _storage; } + const T *real_begin() const { return _storage; } + void swap(device_vector &v) { + ::std::swap(_size, v._size); + ::std::swap(_capacity, v._capacity); + ::std::swap(_storage, v._storage); + ::std::swap(_alloc, v._alloc); + } + reference operator[](size_type n) { return _storage[n]; } + const_reference operator[](size_type n) const { return _storage[n]; } + void reserve(size_type n) { + if (n > capacity()) { + // allocate buffer for new size + auto tmp = _alloc.allocate(2 * n); + // copy content (old buffer to new buffer) + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin(), end(), tmp); + // deallocate old memory + _alloc.deallocate(_storage, _capacity); + _storage = tmp; + _capacity = 2 * n; + } + } + void resize(size_type new_size, const T &x = T()) { + reserve(new_size); + if (_size < new_size) { + ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin() + _size, begin() + new_size, x); + } + _size = new_size; + } + size_type max_size(void) const { + return ::std::numeric_limits::max() / sizeof(T); + } + size_type capacity() const { return _capacity; } + const_reference front() const { return *begin(); } + reference front() { return *begin(); } + const_reference back(void) const { return *(end() - 1); } + reference back(void) { return *(end() - 1); } + pointer data(void) { return _storage; } + const_pointer data(void) const { return _storage; } + void shrink_to_fit(void) { + if (_size != capacity()) { + size_type tmp_capacity = ::std::max(_size, _min_capacity()); + auto tmp = _alloc.allocate(tmp_capacity); + if (_size > 0) { + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin(), end(), tmp); + } + _alloc.deallocate(_storage, _capacity); + _storage = tmp; + _capacity = tmp_capacity; + } + } + void assign(size_type n, const T &x) { + resize(n); + if (_size > 0) { + ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin(), begin() + n, x); + } + } + template + void + assign(InputIterator first, + typename ::std::enable_if::value, + InputIterator>::type last) { + auto n = ::std::distance(first, last); + resize(n); + if (_size > 0) { + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, begin()); + } + } + void clear(void) { _size = 0; } + bool empty(void) const { return (size() == 0); } + void push_back(const T &x) { insert(end(), size_type(1), x); } + void pop_back(void) { + if (_size > 0) + --_size; + } + iterator erase(iterator first, iterator last) { + auto n = ::std::distance(first, last); + if (last == end()) { + _size = _size - n; + return end(); + } + auto m = ::std::distance(last, end()); + if (m <= 0) { + return end(); + } + auto tmp = _alloc.allocate(m); + // copy remainder to temporary buffer. + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + last, end(), tmp); + // override (erase) subsequence in storage. + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + tmp, tmp + m, first); + _alloc.deallocate(tmp, m); + _size -= n; + return begin() + first.get_idx() + n; + } + iterator erase(iterator pos) { return erase(pos, pos + 1); } + iterator insert(iterator position, const T &x) { + auto n = ::std::distance(begin(), position); + insert(position, size_type(1), x); + return begin() + n; + } + void insert(iterator position, size_type n, const T &x) { + if (position == end()) { + resize(size() + n); + ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), + end() - n, end(), x); + } else { + auto i_n = ::std::distance(begin(), position); + // allocate temporary storage + auto m = ::std::distance(position, end()); + // will throw if position is not inside active vector + auto tmp = _alloc.allocate(m); + // copy remainder + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + position, end(), tmp); + + resize(size() + n); + // resizing might invalidate position + position = begin() + position.get_idx(); + + ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), + position, position + n, x); + + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + tmp, tmp + m, position + n); + _alloc.deallocate(tmp, m); + } + } + template + void + insert(iterator position, InputIterator first, + typename ::std::enable_if::value, + InputIterator>::type last) { + auto n = ::std::distance(first, last); + if (position == end()) { + resize(size() + n); + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, end()); + } else { + auto m = ::std::distance(position, end()); + // will throw if position is not inside active vector + auto tmp = _alloc.allocate(m); + + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + position, end(), tmp); + + resize(size() + n); + // resizing might invalidate position + position = begin() + position.get_idx(); + + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, position); + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + tmp, tmp + m, position + n); + _alloc.deallocate(tmp, m); + } + } + Allocator get_allocator() const { return _alloc; } +}; + +#else + +template > +class device_vector { + static_assert( + std::is_same>::value, + "device_vector doesn't support custom allocator when USM is not used."); + +public: + using iterator = device_iterator; + using const_iterator = const iterator; + using reference = device_reference; + using const_reference = const reference; + using value_type = T; + using pointer = T *; + using const_pointer = const T *; + using difference_type = + typename std::iterator_traits::difference_type; + using size_type = std::size_t; + +private: + using Buffer = sycl::buffer; + using Range = sycl::range<1>; + // Using mem_mgr to handle memory allocation + void *_storage; + size_type _size; + + size_type _min_capacity() const { return size_type(1); } + + void *alloc_store(size_type num_bytes) { + return detail::mem_mgr::instance().mem_alloc(num_bytes); + } + +public: + template operator std::vector() const { + auto __tmp = std::vector(this->size()); + std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(), + __tmp.begin()); + return __tmp; + } + device_vector() + : _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {} + ~device_vector() = default; + explicit device_vector(size_type n) : device_vector(n, T()) {} + explicit device_vector(size_type n, const T &value) + : _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))), + _size(n) { + auto buf = get_buffer(); + std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf), + oneapi::dpl::begin(buf) + n, T(value)); + } + device_vector(const device_vector &other) + : _storage(other._storage), _size(other.size()) {} + device_vector(device_vector &&other) + : _storage(std::move(other._storage)), _size(other.size()) {} + + template + device_vector(InputIterator first, + typename std::enable_if< + internal::is_iterator::value && + !std::is_pointer::value && + std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + InputIterator>::type last) + : _storage(alloc_store(std::distance(first, last) * sizeof(T))), + _size(std::distance(first, last)) { + auto buf = get_buffer(); + auto dst = oneapi::dpl::begin(buf); + std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, last, dst); + } + + template + device_vector(InputIterator first, + typename std::enable_if::value, + InputIterator>::type last) + : _storage(alloc_store(std::distance(first, last) * sizeof(T))), + _size(std::distance(first, last)) { + auto buf = get_buffer(); + Buffer tmp_buf(first, last); + auto start = oneapi::dpl::begin(tmp_buf); + auto end = oneapi::dpl::end(tmp_buf); + auto dst = oneapi::dpl::begin(buf); + std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + start, end, dst); + } + + template + device_vector(InputIterator first, + typename std::enable_if< + internal::is_iterator::value && + !std::is_same::iterator_category, + std::random_access_iterator_tag>::value, + InputIterator>::type last) + : _storage(alloc_store(std::distance(first, last) * sizeof(T))), + _size(std::distance(first, last)) { + auto buf = get_buffer(); + std::vector tmp(first, last); + Buffer tmp_buf(tmp); + auto start = oneapi::dpl::begin(tmp_buf); + auto end = oneapi::dpl::end(tmp_buf); + auto dst = oneapi::dpl::begin(buf); + std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + start, end, dst); + } + + template + device_vector(const device_vector &v) + : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) { + auto buf = get_buffer(); + auto dst = oneapi::dpl::begin(buf); + std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + v.real_begin(), v.real_begin() + v.size(), dst); + } + + template + device_vector(std::vector &v) + : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) { + std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(), + oneapi::dpl::begin(get_buffer())); + } + + device_vector &operator=(const device_vector &other) { + // Copy assignment operator: + _size = other.size(); + void *tmp = alloc_store(_size * sizeof(T)); + auto tmp_buf = + detail::mem_mgr::instance() + .translate_ptr(tmp) + .buffer.template reinterpret(sycl::range<1>(_size)); + std::copy(oneapi::dpl::execution::dpcpp_default, + oneapi::dpl::begin(other.get_buffer()), + oneapi::dpl::end(other.get_buffer()), + oneapi::dpl::begin(tmp_buf)); + detail::mem_mgr::instance().mem_free(_storage); + _storage = tmp; + return *this; + } + device_vector &operator=(device_vector &&other) { + // Move assignment operator: + _size = other.size(); + this->_storage = std::move(other._storage); + return *this; + } + template + device_vector &operator=(const std::vector &v) { + Buffer data(v.begin(), v.end()); + _size = v.size(); + void *tmp = alloc_store(_size * sizeof(T)); + auto tmp_buf = + detail::mem_mgr::instance() + .translate_ptr(tmp) + .buffer.template reinterpret(sycl::range<1>(_size)); + std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data), + oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf)); + detail::mem_mgr::instance().mem_free(_storage); + _storage = tmp; + + return *this; + } + Buffer get_buffer() const { + return detail::mem_mgr::instance() + .translate_ptr(_storage) + .buffer.template reinterpret(sycl::range<1>(capacity())); + } + size_type size() const { return _size; } + iterator begin() noexcept { return device_iterator(get_buffer(), 0); } + iterator end() { return device_iterator(get_buffer(), _size); } + const_iterator begin() const noexcept { + return device_iterator(get_buffer(), 0); + } + const_iterator cbegin() const noexcept { return begin(); } + const_iterator end() const { return device_iterator(get_buffer(), _size); } + const_iterator cend() const { return end(); } + T *real_begin() { + return (detail::mem_mgr::instance() + .translate_ptr(_storage) + .buffer.template get_access()) + .get_pointer(); + } + const T *real_begin() const { + return const_cast(this) + ->detail::mem_mgr::instance() + .translate_ptr(_storage) + .buffer.template get_access() + .get_pointer(); + } + void swap(device_vector &v) { + void *temp = v._storage; + v._storage = this->_storage; + this->_storage = temp; + std::swap(_size, v._size); + } + reference operator[](size_type n) { return *(begin() + n); } + const_reference operator[](size_type n) const { return *(begin() + n); } + void reserve(size_type n) { + if (n > capacity()) { + // create new buffer (allocate for new size) + void *a = alloc_store(n * sizeof(T)); + + // copy content (old buffer to new buffer) + if (_storage != nullptr) { + auto tmp = detail::mem_mgr::instance() + .translate_ptr(a) + .buffer.template reinterpret(sycl::range<1>(n)); + auto src_buf = get_buffer(); + std::copy(oneapi::dpl::execution::dpcpp_default, + oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf), + oneapi::dpl::begin(tmp)); + + // deallocate old memory + detail::mem_mgr::instance().mem_free(_storage); + } + _storage = a; + } + } + void resize(size_type new_size, const T &x = T()) { + reserve(new_size); + if (_size < new_size) { + auto src_buf = get_buffer(); + std::fill(oneapi::dpl::execution::dpcpp_default, + oneapi::dpl::begin(src_buf) + _size, + oneapi::dpl::begin(src_buf) + new_size, x); + } + _size = new_size; + } + size_type max_size(void) const { + return std::numeric_limits::max() / sizeof(T); + } + size_type capacity() const { + return _storage != nullptr ? detail::mem_mgr::instance() + .translate_ptr(_storage) + .buffer.size() / + sizeof(T) + : 0; + } + const_reference front() const { return *begin(); } + reference front() { return *begin(); } + const_reference back(void) const { return *(end() - 1); } + reference back(void) { return *(end() - 1); } + pointer data(void) { return reinterpret_cast(_storage); } + const_pointer data(void) const { + return reinterpret_cast(_storage); + } + void shrink_to_fit(void) { + if (_size != capacity()) { + void *a = alloc_store(_size * sizeof(T)); + auto tmp = detail::mem_mgr::instance() + .translate_ptr(a) + .buffer.template reinterpret(sycl::range<1>(_size)); + std::copy(oneapi::dpl::execution::dpcpp_default, + oneapi::dpl::begin(get_buffer()), + oneapi::dpl::begin(get_buffer()) + _size, + oneapi::dpl::begin(tmp)); + detail::mem_mgr::instance().mem_free(_storage); + _storage = a; + } + } + void assign(size_type n, const T &x) { + resize(n); + std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x); + } + template + void + assign(InputIterator first, + typename std::enable_if::value, + InputIterator>::type last) { + auto n = std::distance(first, last); + resize(n); + if (internal::is_iterator::value && + !std::is_pointer::value) + std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin()); + else { + Buffer tmp(first, last); + std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp), + oneapi::dpl::end(tmp), begin()); + } + } + void clear(void) { + _size = 0; + detail::mem_mgr::instance().mem_free(_storage); + _storage = nullptr; + } + bool empty(void) const { return (size() == 0); } + void push_back(const T &x) { insert(end(), size_type(1), x); } + void pop_back(void) { + if (_size > 0) + --_size; + } + iterator erase(iterator first, iterator last) { + auto n = std::distance(first, last); + if (last == end()) { + _size = _size - n; + return end(); + } + Buffer tmp{Range(std::distance(last, end()))}; + // copy remainder to temporary buffer. + std::copy(oneapi::dpl::execution::dpcpp_default, last, end(), + oneapi::dpl::begin(tmp)); + // override (erase) subsequence in storage. + std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp), + oneapi::dpl::end(tmp), first); + resize(_size - n); + return begin() + first.get_idx() + n; + } + iterator erase(iterator pos) { return erase(pos, pos + 1); } + iterator insert(iterator position, const T &x) { + auto n = std::distance(begin(), position); + insert(position, size_type(1), x); + return begin() + n; + } + void insert(iterator position, size_type n, const T &x) { + if (position == end()) { + resize(size() + n); + std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x); + } else { + auto i_n = std::distance(begin(), position); + // allocate temporary storage + Buffer tmp{Range(std::distance(position, end()))}; + // copy remainder + std::copy(oneapi::dpl::execution::dpcpp_default, position, end(), + oneapi::dpl::begin(tmp)); + + resize(size() + n); + // resizing might invalidate position + position = begin() + position.get_idx(); + + std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n, + x); + + std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp), + oneapi::dpl::end(tmp), position + n); + } + } + template + void + insert(iterator position, InputIterator first, + typename std::enable_if::value, + InputIterator>::type last) { + auto n = std::distance(first, last); + if (position == end()) { + resize(size() + n); + std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end()); + } else { + Buffer tmp{Range(std::distance(position, end()))}; + + std::copy(oneapi::dpl::execution::dpcpp_default, position, end(), + oneapi::dpl::begin(tmp)); + + resize(size() + n); + // resizing might invalidate position + position = begin() + position.get_idx(); + + std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position); + std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp), + oneapi::dpl::end(tmp), position + n); + } + } +}; + +#endif + +} // end namespace dpct + +#endif diff --git a/dpct/dpl_utils.hpp b/dpct/dpl_utils.hpp new file mode 100644 index 0000000000000..79a6e74048f33 --- /dev/null +++ b/dpct/dpl_utils.hpp @@ -0,0 +1,26 @@ +//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_DPL_UTILS_HPP__ +#define __DPCT_DPL_UTILS_HPP__ + +#define ONEDPL_USE_DPCPP_BACKEND 1 +#define __USE_DPCT 1 + +#include +#include +#include + +#include "dpl_extras/memory.h" +#include "dpl_extras/algorithm.h" +#include "dpl_extras/numeric.h" +#include "dpl_extras/iterators.h" +#include "dpl_extras/vector.h" +#include "dpl_extras/dpcpp_extensions.h" + +#endif // __DPCT_DPL_UTILS_HPP__ diff --git a/dpct/fft_utils.hpp b/dpct/fft_utils.hpp new file mode 100644 index 0000000000000..cba1b253cecaf --- /dev/null +++ b/dpct/fft_utils.hpp @@ -0,0 +1,1376 @@ +//==---- fft_utils.hpp ----------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_FFT_UTILS_HPP__ +#define __DPCT_FFT_UTILS_HPP__ + +#include +#include +#include +#include +#include "lib_common_utils.hpp" + +namespace dpct { +namespace fft { +/// An enumeration type to describe the FFT direction is forward or backward. +enum fft_direction : int { + forward = 0, + backward +}; +/// An enumeration type to describe the types of FFT input and output data. +enum fft_type : int { + real_float_to_complex_float = 0, + complex_float_to_real_float, + real_double_to_complex_double, + complex_double_to_real_double, + complex_float_to_complex_float, + complex_double_to_complex_double, +}; + +/// A class to perform FFT calculation. +class fft_engine { +public: + /// Default constructor. + fft_engine() {} + /// Commit the configuration to calculate n-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] input_type Input data type. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] output_type Output data type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int dim, long long *n, + long long *inembed, long long istride, long long idist, + library_data_t input_type, long long *onembed, long long ostride, + long long odist, library_data_t output_type, long long batch, + size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + _q = exec_queue; + init(dim, n, inembed, istride, idist, input_type, onembed, + ostride, odist, output_type, batch, + direction_and_placement); + if (scratchpad_size) { + if (_is_estimate_call) + *scratchpad_size = _workspace_estimate_bytes; + else + *scratchpad_size = _workspace_bytes; + } + } + /// Commit the configuration to calculate n-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] input_type Input data type. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] output_type Output data type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed, + int istride, int idist, library_data_t input_type, int *onembed, + int ostride, int odist, library_data_t output_type, int batch, + size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + _q = exec_queue; + init(dim, n, inembed, istride, idist, input_type, onembed, ostride, + odist, output_type, batch, direction_and_placement); + if (scratchpad_size) { + if (_is_estimate_call) + *scratchpad_size = _workspace_estimate_bytes; + else + *scratchpad_size = _workspace_bytes; + } + } + /// Commit the configuration to calculate n-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int dim, long long *n, + long long *inembed, long long istride, long long idist, + long long *onembed, long long ostride, long long odist, + fft_type type, long long batch, size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + commit(exec_queue, dim, n, inembed, istride, idist, + fft_type_to_data_type(type).first, onembed, ostride, odist, + fft_type_to_data_type(type).second, batch, scratchpad_size, + direction_and_placement); + } + /// Commit the configuration to calculate n-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed, + int istride, int idist, int *onembed, int ostride, int odist, + fft_type type, int batch, size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + commit(exec_queue, dim, n, inembed, istride, idist, + fft_type_to_data_type(type).first, onembed, ostride, odist, + fft_type_to_data_type(type).second, batch, scratchpad_size, + direction_and_placement); + } + /// Commit the configuration to calculate 1-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n1 The size of the dimension of the data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int n1, fft_type type, int batch, + size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + _q = exec_queue; + _n.resize(1); + _n[0] = n1; + std::tie(_input_type, _output_type) = fft_type_to_data_type(type); + _dim = 1; + _batch = batch; + _is_basic = true; + if (direction_and_placement.has_value()) { + _is_user_specified_dir_and_placement = true; + _direction = direction_and_placement->first; + _is_inplace = direction_and_placement->second; + } + config_and_commit_basic(); + if (scratchpad_size) { + if (_is_estimate_call) + *scratchpad_size = _workspace_estimate_bytes; + else + *scratchpad_size = _workspace_bytes; + } + } + /// Commit the configuration to calculate 2-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n2 The size of the 2nd dimension (outermost) of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int n2, int n1, fft_type type, + size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + _q = exec_queue; + _n.resize(2); + _n[0] = n2; + _n[1] = n1; + std::tie(_input_type, _output_type) = fft_type_to_data_type(type); + _dim = 2; + _is_basic = true; + if (direction_and_placement.has_value()) { + _is_user_specified_dir_and_placement = true; + _direction = direction_and_placement->first; + _is_inplace = direction_and_placement->second; + } + config_and_commit_basic(); + if (scratchpad_size) { + if (_is_estimate_call) + *scratchpad_size = _workspace_estimate_bytes; + else + *scratchpad_size = _workspace_bytes; + } + } + /// Commit the configuration to calculate 3-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n3 The size of the 3rd dimension (outermost) of the data. + /// \param [in] n2 The size of the 2nd dimension of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [out] scratchpad_size The workspace size required for this FFT. + /// If this value is used to allocate memory, \p direction_and_placement need + /// to be specified explicitly to get correct result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + void commit(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type, + size_t *scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + _q = exec_queue; + _n.resize(3); + _n[0] = n3; + _n[1] = n2; + _n[2] = n1; + std::tie(_input_type, _output_type) = fft_type_to_data_type(type); + _dim = 3; + _is_basic = true; + if (direction_and_placement.has_value()) { + _is_user_specified_dir_and_placement = true; + _direction = direction_and_placement->first; + _is_inplace = direction_and_placement->second; + } + config_and_commit_basic(); + if (scratchpad_size) { + if (_is_estimate_call) + *scratchpad_size = _workspace_estimate_bytes; + else + *scratchpad_size = _workspace_bytes; + } + } + + /// Create the class for calculate 1-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n1 The size of the dimension of the data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + static fft_engine * + create(sycl::queue *exec_queue, int n1, fft_type type, int batch, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = new fft_engine(); + engine->commit(exec_queue, n1, type, batch, nullptr, + direction_and_placement); + return engine; + } + /// Create the class for calculate 2-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n2 The size of the 2nd dimension (outermost) of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + static fft_engine * + create(sycl::queue *exec_queue, int n2, int n1, fft_type type, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = new fft_engine(); + engine->commit(exec_queue, n2, n1, type, nullptr, direction_and_placement); + return engine; + } + /// Create the class for calculate 3-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] n3 The size of the 3rd dimension (outermost) of the data. + /// \param [in] n2 The size of the 2nd dimension of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + static fft_engine * + create(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = new fft_engine(); + engine->commit(exec_queue, n3, n2, n1, type, nullptr, + direction_and_placement); + return engine; + } + /// Create the class for calculate n-D FFT. + /// \param [in] exec_queue The queue where the calculation should be executed. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If this value is specified, the direction parameter + /// will be ignored in the fft_engine::compute function. If it is not set, + /// forward direction(if current FFT is complex-to-complex) and out-of-place + /// (false) are set by default. + static fft_engine * + create(sycl::queue *exec_queue, int dim, int *n, int *inembed, int istride, + int idist, int *onembed, int ostride, int odist, fft_type type, + int batch, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = new fft_engine(); + engine->commit(exec_queue, dim, n, inembed, istride, idist, onembed, + ostride, odist, type, batch, nullptr, + direction_and_placement); + return engine; + } + /// Create the class for calculate FFT without commit any config. + static fft_engine *create() { + fft_engine *engine = new fft_engine(); + return engine; + } + /// Destroy the class for calculate FFT. + /// \param [in] engine Pointer returned from fft_engine::craete. + static void destroy(fft_engine *engine) { delete engine; } + +#ifdef __INTEL_MKL__ + /// Estimates the workspace size for calculating n-D FFT. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] estimated_scratchpad_size The estimated workspace size + /// required for this FFT. If this value is used to allocate memory, + /// \p direction_and_placement need to be specified explicitly to get correct + /// result. + /// \param [in] direction_and_placement Explicitly specify the FFT + /// direction and placement info. If it is not set, forward direction(if + /// current FFT is complex-to-complex) and out-of-place (false) are set by default. + static void + estimate_size(int dim, long long *n, long long *inembed, long long istride, + long long idist, long long *onembed, long long ostride, + long long odist, fft_type type, long long batch, + size_t *estimated_scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = fft_engine::create(); + engine->_is_estimate_call = true; + engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist, + fft_type_to_data_type(type).first, onembed, ostride, odist, + fft_type_to_data_type(type).second, batch, + estimated_scratchpad_size, direction_and_placement); + fft_engine::destroy(engine); + } + /// Estimates the workspace size for calculating n-D FFT. + /// \param [in] dim Dimension number of the data. + /// \param [in] n Pointer to an array containing each dimension's size. + /// \param [in] inembed Pointer to an array containing each dimension's size + /// of the embedded input data. + /// \param [in] istride Stride size of the input data. + /// \param [in] idist Distance between the two batches of the input data. + /// \param [in] onembed Pointer to an array containing each dimension's size + /// of the embedded output data. + /// \param [in] ostride Stride size of the output data. + /// \param [in] odist Distance between the two batches of the output data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] estimated_scratchpad_size The estimated workspace size + /// required for this FFT. If this value is used to allocate memory, + /// \p direction_and_placement need to be specified explicitly to get correct + /// result. + /// \param [in] direction_and_placement Explicitly specify the FFT + /// direction and placement info. If it is not set, forward direction(if + /// current FFT is complex-to-complex) and out-of-place (false) are set by default. + static void + estimate_size(int dim, int *n, int *inembed, int istride, int idist, + int *onembed, int ostride, int odist, fft_type type, int batch, + size_t *estimated_scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = fft_engine::create(); + engine->_is_estimate_call = true; + engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist, + fft_type_to_data_type(type).first, onembed, ostride, odist, + fft_type_to_data_type(type).second, batch, + estimated_scratchpad_size, direction_and_placement); + fft_engine::destroy(engine); + } + /// Estimates the workspace size for calculating 1-D FFT. + /// \param [in] n1 The size of the dimension of the data. + /// \param [in] type The FFT type. + /// \param [in] batch The number of FFT operations to perform. + /// \param [out] estimated_scratchpad_size The estimated workspace size + /// required for this FFT. If this value is used to allocate memory, + /// \p direction_and_placement need to be specified explicitly to get correct + /// result. + /// \param [in] direction_and_placement Explicitly specify the FFT direction + /// and placement info. If it is not set, forward direction(if current FFT is + /// complex-to-complex) and out-of-place (false) are set by default. + static void + estimate_size(int n1, fft_type type, int batch, + size_t *estimated_scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = fft_engine::create(); + engine->_is_estimate_call = true; + engine->commit(&dpct::get_default_queue(), n1, type, batch, + estimated_scratchpad_size, direction_and_placement); + fft_engine::destroy(engine); + } + /// Estimates the workspace size for calculating 2-D FFT. + /// \param [in] n2 The size of the 2nd dimension (outermost) of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [out] estimated_scratchpad_size The estimated workspace size + /// required for this FFT. If this value is used to allocate memory, + /// \p direction_and_placement need to be specified explicitly to get correct + /// result. + /// \param [in] direction_and_placement Explicitly specify the FFT + /// direction and placement info. If it is not set, forward direction(if + /// current FFT is complex-to-complex) and out-of-place (false) are set by default. + static void + estimate_size(int n2, int n1, fft_type type, + size_t *estimated_scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = fft_engine::create(); + engine->_is_estimate_call = true; + engine->commit(&dpct::get_default_queue(), n2, n1, type, + estimated_scratchpad_size, direction_and_placement); + fft_engine::destroy(engine); + } + /// Estimates the workspace size for calculating 3-D FFT. + /// \param [in] n3 The size of the 3rd dimension (outermost) of the data. + /// \param [in] n2 The size of the 2nd dimension of the data. + /// \param [in] n1 The size of the 1st dimension (innermost) of the data. + /// \param [in] type The FFT type. + /// \param [out] estimated_scratchpad_size The estimated workspace size + /// required for this FFT. If this value is used to allocate memory, + /// \p direction_and_placement need to be specified explicitly to get correct + /// result. + /// \param [in] direction_and_placement Explicitly specify the FFT + /// direction and placement info. If it is not set, forward direction(if + /// current FFT is complex-to-complex) and out-of-place (false) are set by default. + static void + estimate_size(int n3, int n2, int n1, fft_type type, + size_t *estimated_scratchpad_size, + std::optional> + direction_and_placement = std::nullopt) { + fft_engine *engine = fft_engine::create(); + engine->_is_estimate_call = true; + engine->commit(&dpct::get_default_queue(), n3, n2, n1, type, + estimated_scratchpad_size, direction_and_placement); + fft_engine::destroy(engine); + } +#endif + + /// Execute the FFT calculation. + /// \param [in] input Pointer to the input data. + /// \param [out] output Pointer to the output data. + /// \param [in] direction The FFT direction. + template + void compute(input_t *input, output_t *output, fft_direction direction) { + if (_input_type == library_data_t::complex_float && + _output_type == library_data_t::complex_float) { + compute_complex( + (float *)input, (float *)output, direction); + } else if (_input_type == library_data_t::complex_double && + _output_type == library_data_t::complex_double) { + compute_complex( + (double *)input, (double *)output, direction); + } else if (_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) { + _direction = direction; + compute_real((float *)input, + (float *)output); + } else if (_input_type == library_data_t::complex_float && + _output_type == library_data_t::real_float) { + _direction = direction; + compute_real((float *)input, + (float *)output); + } else if (_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double) { + _direction = direction; + compute_real( + (double *)input, (double *)output); + } else if (_input_type == library_data_t::complex_double && + _output_type == library_data_t::real_double) { + _direction = direction; + compute_real( + (double *)input, (double *)output); + } + } + template <> + void compute(float *input, sycl::float2 *output, fft_direction direction) { + _direction = direction; + compute_real((float *)input, + (float *)output); + } + template <> + void compute(sycl::float2 *input, float *output, fft_direction direction) { + _direction = direction; + compute_real((float *)input, + (float *)output); + } + template <> + void compute(double *input, sycl::double2 *output, fft_direction direction) { + _direction = direction; + compute_real((double *)input, + (double *)output); + } + template <> + void compute(sycl::double2 *input, double *output, fft_direction direction) { + _direction = direction; + compute_real((double *)input, + (double *)output); + } + template <> + void compute(sycl::float2 *input, sycl::float2 *output, + fft_direction direction) { + compute_complex( + (float *)input, (float *)output, direction); + } + template <> + void compute(sycl::double2 *input, sycl::double2 *output, + fft_direction direction) { + compute_complex( + (double *)input, (double *)output, direction); + } + /// Setting the user's SYCL queue for calculation. + /// \param [in] q Pointer to the SYCL queue. + void set_queue(sycl::queue *q) { _q = q; } +#ifdef __INTEL_MKL__ + /// Setting whether to use external or internal workspace. + /// \param [in] flag True means using internal workspace. False means using + /// external workspace. + void use_internal_workspace(bool flag = true) { + _use_external_workspace = !flag; + } + /// Specify the external workspace. + /// \param [in] ptr Pointer to the workspace. + void set_workspace(void *ptr) { + if (!_use_external_workspace) { + return; + } + if (_input_type == library_data_t::complex_float && + _output_type == library_data_t::complex_float) { + if (_q->get_device().is_gpu()) { + auto data = dpct::detail::get_memory(ptr); + _desc_sc->set_workspace(data); + } + } else if (_input_type == library_data_t::complex_double && + _output_type == library_data_t::complex_double) { + if (_q->get_device().is_gpu()) { + auto data = dpct::detail::get_memory(ptr); + _desc_dc->set_workspace(data); + } + } else if ((_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) || + (_input_type == library_data_t::complex_float && + _output_type == library_data_t::real_float)) { + if (_q->get_device().is_gpu()) { + auto data = dpct::detail::get_memory(ptr); + _desc_sr->set_workspace(data); + } + } else if ((_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double) || + (_input_type == library_data_t::complex_double && + _output_type == library_data_t::real_double)) { + if (_q->get_device().is_gpu()) { + auto data = dpct::detail::get_memory(ptr); + _desc_dr->set_workspace(data); + } + } else { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "invalid fft type"); + } + } +#endif + /// Get the workspace size. + /// \param [out] scratchpad_size Workspace size in bytes. + void get_workspace_size(size_t *scratchpad_size) { + if (scratchpad_size) { + *scratchpad_size = _workspace_bytes; + } + } + +private: + static std::pair + fft_type_to_data_type(fft_type type) { + switch (type) { + case fft_type::real_float_to_complex_float: { + return std::make_pair(library_data_t::real_float, + library_data_t::complex_float); + } + case fft_type::complex_float_to_real_float: { + return std::make_pair(library_data_t::complex_float, + library_data_t::real_float); + } + case fft_type::real_double_to_complex_double: { + return std::make_pair(library_data_t::real_double, + library_data_t::complex_double); + } + case fft_type::complex_double_to_real_double: { + return std::make_pair(library_data_t::complex_double, + library_data_t::real_double); + } + case fft_type::complex_float_to_complex_float: { + return std::make_pair(library_data_t::complex_float, + library_data_t::complex_float); + } + case fft_type::complex_double_to_complex_double: { + return std::make_pair(library_data_t::complex_double, + library_data_t::complex_double); + } + } + } + + void config_and_commit_basic() { + if (_input_type == library_data_t::complex_float && + _output_type == library_data_t::complex_float) { + _desc_sc = std::make_shared< + oneapi::mkl::dft::descriptor>(_n); + std::int64_t distance = 1; + for (auto i : _n) + distance = distance * i; + _fwd_dist = distance; + _bwd_dist = distance; + _desc_sc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + distance); + _desc_sc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + distance); + _desc_sc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + _batch); +#ifdef __INTEL_MKL__ + if (_is_user_specified_dir_and_placement && _is_inplace) + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + else + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + if (_use_external_workspace) { + if (_q->get_device().is_gpu()) { + _desc_sc->set_value( + oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + } + } + if (_is_estimate_call) { + if (_q->get_device().is_gpu()) { + _desc_sc->get_value( + oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES, + &_workspace_estimate_bytes); + } + } else { + _desc_sc->commit(*_q); + if (_q->get_device().is_gpu()) { + _desc_sc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &_workspace_bytes); + } + } +#else + if (_is_user_specified_dir_and_placement && _is_inplace) + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + else + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + _desc_sc->commit(*_q); +#endif + } else if (_input_type == library_data_t::complex_double && + _output_type == library_data_t::complex_double) { + _desc_dc = std::make_shared< + oneapi::mkl::dft::descriptor>(_n); + std::int64_t distance = 1; + for (auto i : _n) + distance = distance * i; + _fwd_dist = distance; + _bwd_dist = distance; + _desc_dc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + distance); + _desc_dc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + distance); + _desc_dc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + _batch); +#ifdef __INTEL_MKL__ + if (_is_user_specified_dir_and_placement && _is_inplace) + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + else + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + if (_use_external_workspace) { + if (_q->get_device().is_gpu()) { + _desc_dc->set_value( + oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + } + } + if (_is_estimate_call) { + if (_q->get_device().is_gpu()) { + _desc_dc->get_value( + oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES, + &_workspace_estimate_bytes); + } + } else { + _desc_dc->commit(*_q); + if (_q->get_device().is_gpu()) { + _desc_dc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &_workspace_bytes); + } + } +#else + if (_is_user_specified_dir_and_placement && _is_inplace) + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + else + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + _desc_dc->commit(*_q); +#endif + } else if ((_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) || + (_input_type == library_data_t::complex_float && + _output_type == library_data_t::real_float)) { + _desc_sr = std::make_shared>( + _n); + if (_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) + _direction = fft_direction::forward; + else + _direction = fft_direction::backward; + _desc_sr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + _batch); +#ifdef __INTEL_MKL__ + if (_is_user_specified_dir_and_placement && _is_inplace) { + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + set_stride_and_distance_basic(_desc_sr); + } else { + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + set_stride_and_distance_basic(_desc_sr); + } + if (_use_external_workspace) { + if (_q->get_device().is_gpu()) { + _desc_sr->set_value( + oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + } + } + if (_is_estimate_call) { + if (_q->get_device().is_gpu()) { + _desc_sr->get_value( + oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES, + &_workspace_estimate_bytes); + } + } else { + _desc_sr->commit(*_q); + if (_q->get_device().is_gpu()) { + _desc_sr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &_workspace_bytes); + } + } +#else + if (_is_user_specified_dir_and_placement && _is_inplace) { + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + set_stride_and_distance_basic(_desc_sr); + } else { + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + set_stride_and_distance_basic(_desc_sr); + } + _desc_sr->commit(*_q); +#endif + } else if ((_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double) || + (_input_type == library_data_t::complex_double && + _output_type == library_data_t::real_double)) { + _desc_dr = std::make_shared>( + _n); + if (_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double) + _direction = fft_direction::forward; + else + _direction = fft_direction::backward; + _desc_dr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + _batch); +#ifdef __INTEL_MKL__ + if (_is_user_specified_dir_and_placement && _is_inplace) { + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + set_stride_and_distance_basic(_desc_dr); + } else { + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + set_stride_and_distance_basic(_desc_dr); + } + if (_use_external_workspace) { + if (_q->get_device().is_gpu()) { + _desc_dr->set_value( + oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + } + } + if (_is_estimate_call) { + if (_q->get_device().is_gpu()) { + _desc_dr->get_value( + oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES, + &_workspace_estimate_bytes); + } + } else { + _desc_dr->commit(*_q); + if (_q->get_device().is_gpu()) { + _desc_dr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &_workspace_bytes); + } + } +#else + if (_is_user_specified_dir_and_placement && _is_inplace) { + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + set_stride_and_distance_basic(_desc_dr); + } else { + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + set_stride_and_distance_basic(_desc_dr); + } + _desc_dr->commit(*_q); +#endif + } else { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "invalid fft type"); + } + } + + void config_and_commit_advanced() { +#ifdef __INTEL_MKL__ +#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE) \ + { \ + DESC = std::make_shared>( \ + _n); \ + set_stride_advanced(DESC); \ + DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist); \ + DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist); \ + DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, \ + _batch); \ + if (_is_user_specified_dir_and_placement && _is_inplace) \ + DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT, \ + DFTI_CONFIG_VALUE::DFTI_INPLACE); \ + else \ + DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT, \ + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); \ + if (_use_external_workspace) { \ + DESC->set_value(oneapi::mkl::dft::config_param::WORKSPACE, \ + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); \ + } \ + if (_is_estimate_call) { \ + if (_q->get_device().is_gpu()) { \ + DESC->get_value( \ + oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES, \ + &_workspace_estimate_bytes); \ + } \ + } else { \ + DESC->commit(*_q); \ + if (_is_estimate_call) { \ + DESC->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, \ + &_workspace_bytes); \ + } \ + } \ + } +#else +#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE) \ + { \ + DESC = std::make_shared>( \ + _n); \ + set_stride_advanced(DESC); \ + DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist); \ + DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist); \ + DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, \ + _batch); \ + if (_is_user_specified_dir_and_placement && _is_inplace) \ + DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT, \ + oneapi::mkl::dft::config_value::INPLACE); \ + else \ + DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT, \ + oneapi::mkl::dft::config_value::NOT_INPLACE); \ + DESC->commit(*_q); \ + } +#endif + + if (_input_type == library_data_t::complex_float && + _output_type == library_data_t::complex_float) { + CONFIG_AND_COMMIT(_desc_sc, SINGLE, COMPLEX, float); + } else if (_input_type == library_data_t::complex_double && + _output_type == library_data_t::complex_double) { + CONFIG_AND_COMMIT(_desc_dc, DOUBLE, COMPLEX, double); + } else if ((_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) || + (_input_type == library_data_t::complex_float && + _output_type == library_data_t::real_float)) { + CONFIG_AND_COMMIT(_desc_sr, SINGLE, REAL, float); + } else if ((_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double) || + (_input_type == library_data_t::complex_double && + _output_type == library_data_t::real_double)) { + CONFIG_AND_COMMIT(_desc_dr, DOUBLE, REAL, double); + } else { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "invalid fft type"); + } +#undef CONFIG_AND_COMMIT + } + + template + void init(int dim, T *n, T *inembed, T istride, T idist, + library_data_t input_type, T *onembed, T ostride, T odist, + library_data_t output_type, T batch, + std::optional> + direction_and_placement) { + if (direction_and_placement.has_value()) { + _is_user_specified_dir_and_placement = true; + _direction = direction_and_placement->first; + _is_inplace = direction_and_placement->second; + } + _n.resize(dim); + _inembed.resize(dim); + _onembed.resize(dim); + _input_type = input_type; + _output_type = output_type; + for (int i = 0; i < dim; i++) { + _n[i] = n[i]; + } + if (inembed && onembed) { + for (int i = 0; i < dim; i++) { + _inembed[i] = inembed[i]; + _onembed[i] = onembed[i]; + } + _istride = istride; + _ostride = ostride; + + if ((_input_type == library_data_t::real_float && + _output_type == library_data_t::complex_float) || + (_input_type == library_data_t::real_double && + _output_type == library_data_t::complex_double)) { + _fwd_dist = idist; + _bwd_dist = odist; + } else if ((_output_type == library_data_t::real_float && + _input_type == library_data_t::complex_float) || + (_output_type == library_data_t::real_double && + _input_type == library_data_t::complex_double)) { + _fwd_dist = odist; + _bwd_dist = idist; + } else { + if (_is_user_specified_dir_and_placement && + (_direction == fft_direction::backward)) { + _fwd_dist = odist; + _bwd_dist = idist; + } else { + _fwd_dist = idist; + _bwd_dist = odist; + } + } + } else { + _is_basic = true; + } + _batch = batch; + _dim = dim; + + if (_is_basic) + config_and_commit_basic(); + else + config_and_commit_advanced(); + } + template + void set_stride_advanced(std::shared_ptr desc) { + if (_dim == 1) { + std::int64_t input_stride[2] = {0, _istride}; + std::int64_t output_stride[2] = {0, _ostride}; + desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, + input_stride); + desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, + output_stride); + } else if (_dim == 2) { + std::int64_t input_stride[3] = {0, _inembed[1] * _istride, _istride}; + std::int64_t output_stride[3] = {0, _onembed[1] * _ostride, _ostride}; + desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, + input_stride); + desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, + output_stride); + } else if (_dim == 3) { + std::int64_t input_stride[4] = {0, _inembed[2] * _inembed[1] * _istride, + _inembed[2] * _istride, _istride}; + std::int64_t output_stride[4] = {0, _onembed[2] * _onembed[1] * _ostride, + _onembed[2] * _ostride, _ostride}; + desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, + input_stride); + desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, + output_stride); + } + } + + template void swap_distance(std::shared_ptr desc) { + desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _bwd_dist); + desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _fwd_dist); + std::int64_t temp = _bwd_dist; + _bwd_dist = _fwd_dist; + _fwd_dist = temp; + } + + template + void set_stride_and_distance_basic(std::shared_ptr desc) { + std::int64_t forward_distance = 0; + std::int64_t backward_distance = 0; + +#define SET_STRIDE \ + { \ + if (_direction == fft_direction::forward) { \ + desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, \ + real_stride); \ + desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, \ + complex_stride); \ + } else { \ + desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES, \ + complex_stride); \ + desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES, \ + real_stride); \ + } \ + } + if (_dim == 1) { + if constexpr (Is_inplace) { + std::int64_t real_stride[2] = {0, 1}; + std::int64_t complex_stride[2] = {0, 1}; + SET_STRIDE; + forward_distance = 2 * (_n[0] / 2 + 1); + backward_distance = _n[0] / 2 + 1; + } else { + std::int64_t real_stride[2] = {0, 1}; + std::int64_t complex_stride[2] = {0, 1}; + SET_STRIDE; + forward_distance = _n[0]; + backward_distance = _n[0] / 2 + 1; + } + } else if (_dim == 2) { + if constexpr (Is_inplace) { + std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1}; + std::int64_t real_stride[3] = {0, 2 * (_n[1] / 2 + 1), 1}; + SET_STRIDE; + forward_distance = _n[0] * 2 * (_n[1] / 2 + 1); + backward_distance = _n[0] * (_n[1] / 2 + 1); + } else { + std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1}; + std::int64_t real_stride[3] = {0, _n[1], 1}; + SET_STRIDE; + forward_distance = _n[0] * _n[1]; + backward_distance = _n[0] * (_n[1] / 2 + 1); + } + } else if (_dim == 3) { + if constexpr (Is_inplace) { + std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1), + _n[2] / 2 + 1, 1}; + std::int64_t real_stride[4] = {0, _n[1] * 2 * (_n[2] / 2 + 1), + 2 * (_n[2] / 2 + 1), 1}; + SET_STRIDE; + forward_distance = _n[0] * _n[1] * 2 * (_n[2] / 2 + 1); + backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1); + } else { + std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1), + _n[2] / 2 + 1, 1}; + std::int64_t real_stride[4] = {0, _n[1] * _n[2], _n[2], 1}; + SET_STRIDE; + forward_distance = _n[0] * _n[1] * _n[2]; + backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1); + } + } +#undef SET_STRIDE + desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + forward_distance); + desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + backward_distance); + } + +#define COMPUTE(DESC) \ + { \ + if (_is_inplace) { \ + auto data_input = dpct::detail::get_memory(input); \ + if (_direction == fft_direction::forward) { \ + oneapi::mkl::dft::compute_forward< \ + std::remove_reference_t, T>(*DESC, data_input); \ + } else { \ + oneapi::mkl::dft::compute_backward< \ + std::remove_reference_t, T>(*DESC, data_input); \ + } \ + } else { \ + auto data_input = dpct::detail::get_memory(input); \ + auto data_output = dpct::detail::get_memory(output); \ + if (_direction == fft_direction::forward) { \ + oneapi::mkl::dft::compute_forward< \ + std::remove_reference_t, T, T>(*DESC, data_input, \ + data_output); \ + } else { \ + oneapi::mkl::dft::compute_backward< \ + std::remove_reference_t, T, T>(*DESC, data_input, \ + data_output); \ + } \ + } \ + } + + template + void compute_complex(T *input, T *output, fft_direction direction) { + bool is_this_compute_inplace = input == output; + + if (!_is_user_specified_dir_and_placement) { + // The complex domain descriptor need different config values if the + // FFT direction or placement is different. + // Here we check the conditions, and new config values are set and + // re-committed if needed. + if (direction != _direction || is_this_compute_inplace != _is_inplace) { + if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) { + if (direction != _direction) { + swap_distance(_desc_sc); + _direction = direction; + } + if (is_this_compute_inplace != _is_inplace) { + _is_inplace = is_this_compute_inplace; +#ifdef __INTEL_MKL__ + if (_is_inplace) { + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + } else { + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + } +#else + if (_is_inplace) { + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + } else { + _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + } +#endif + } + _desc_sc->commit(*_q); + } else { + if (direction != _direction) { + swap_distance(_desc_dc); + _direction = direction; + } + if (is_this_compute_inplace != _is_inplace) { + _is_inplace = is_this_compute_inplace; +#ifdef __INTEL_MKL__ + if (_is_inplace) { + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); + } else { + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); + } +#else + if (_is_inplace) { + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); + } else { + _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); + } +#endif + } + _desc_dc->commit(*_q); + } + } + } + + if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) { + COMPUTE(_desc_sc); + } else { + COMPUTE(_desc_dc); + } + } + + template + void compute_real(T *input, T *output) { + bool is_this_compute_inplace = input == output; + + if (!_is_user_specified_dir_and_placement) { + // The real domain descriptor need different config values if the + // FFT placement is different. + // Here we check the condition, and new config values are set and + // re-committed if needed. + if (is_this_compute_inplace != _is_inplace) { + if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) { + _is_inplace = is_this_compute_inplace; + if (_is_inplace) { +#ifdef __INTEL_MKL__ + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); +#else + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); +#endif + if (_is_basic) + set_stride_and_distance_basic(_desc_sr); + } else { +#ifdef __INTEL_MKL__ + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); +#else + _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); +#endif + if (_is_basic) + set_stride_and_distance_basic(_desc_sr); + } + _desc_sr->commit(*_q); + } else { + _is_inplace = is_this_compute_inplace; + if (_is_inplace) { +#ifdef __INTEL_MKL__ + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_INPLACE); +#else + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::INPLACE); +#endif + if (_is_basic) + set_stride_and_distance_basic(_desc_dr); + } else { +#ifdef __INTEL_MKL__ + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE); +#else + _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + oneapi::mkl::dft::config_value::NOT_INPLACE); +#endif + if (_is_basic) + set_stride_and_distance_basic(_desc_dr); + } + _desc_dr->commit(*_q); + } + } + } + + if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) { + COMPUTE(_desc_sr); + } else { + COMPUTE(_desc_dr); + } + } +#undef COMPUTE + +private: + sycl::queue *_q = nullptr; + int _dim; + std::vector _n; + std::vector _inembed; + std::int64_t _istride; + std::int64_t _fwd_dist; + library_data_t _input_type; + std::vector _onembed; + std::int64_t _ostride; + std::int64_t _bwd_dist; + library_data_t _output_type; + std::int64_t _batch = 1; + bool _is_basic = false; + bool _is_inplace = false; + fft_direction _direction = fft_direction::forward; + bool _is_user_specified_dir_and_placement = false; + bool _use_external_workspace = false; + void *_external_workspace_ptr = nullptr; + size_t _workspace_bytes = 0; + bool _is_estimate_call = false; + size_t _workspace_estimate_bytes = 0; + std::shared_ptr> + _desc_sr; + std::shared_ptr> + _desc_dr; + std::shared_ptr> + _desc_sc; + std::shared_ptr> + _desc_dc; +}; + +using fft_engine_ptr = fft_engine *; +} // namespace fft +} // namespace dpct + +#endif // __DPCT_FFT_UTILS_HPP__ diff --git a/dpct/image.hpp b/dpct/image.hpp new file mode 100644 index 0000000000000..b9bb246685e7b --- /dev/null +++ b/dpct/image.hpp @@ -0,0 +1,901 @@ +//==---- image.hpp --------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_IMAGE_HPP__ +#define __DPCT_IMAGE_HPP__ + +#include + +#include "memory.hpp" +#include "util.hpp" + +namespace dpct { + +enum class image_channel_data_type { + signed_int, + unsigned_int, + fp, +}; + +class image_channel; +class image_wrapper_base; +namespace detail { +/// Image object type traits, with accessor type and sampled data type defined. +/// The data type of an image accessor must be one of sycl::int4, sycl::uint4, +/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits +/// channel width will be 32 bits. sycl::half is an exception. +template struct image_trait { + using acc_data_t = sycl::vec; + template + using accessor_t = + sycl::accessor; + template + using array_accessor_t = + sycl::accessor; + using data_t = T; + using elem_t = T; + static constexpr image_channel_data_type data_type = + std::is_integral::value + ? (std::is_signed::value ? image_channel_data_type::signed_int + : image_channel_data_type::unsigned_int) + : image_channel_data_type::fp; + static constexpr int channel_num = 1; +}; +template <> +struct image_trait : public image_trait { + using data_t = std::uint8_t; + using elem_t = data_t; +}; +template <> +struct image_trait + : public image_trait { + using data_t = std::uint16_t; + using elem_t = data_t; +}; +template <> +struct image_trait : public image_trait { + using data_t = std::int8_t; + using elem_t = data_t; +}; +template <> +struct image_trait : public image_trait { + using data_t = std::int16_t; + using elem_t = data_t; +}; +template <> +struct image_trait + : public image_trait::value, signed char, unsigned char>::type> {}; + +template +struct image_trait> : public image_trait {}; + +template +struct image_trait> : public image_trait { + using data_t = sycl::vec; + static constexpr int channel_num = 2; +}; + +template +struct image_trait> + : public image_trait> { + static constexpr int channel_num = 3; +}; + +template +struct image_trait> : public image_trait { + using data_t = sycl::vec; + static constexpr int channel_num = 4; +}; + +/// Functor to fetch data from read result of an image accessor. +template struct fetch_data { + using return_t = typename image_trait::data_t; + using acc_data_t = typename image_trait::acc_data_t; + + return_t operator()(acc_data_t &&original_data) { + return (return_t)original_data.r(); + } +}; +template +struct fetch_data> : public fetch_data {}; +template struct fetch_data> { + using return_t = typename image_trait>::data_t; + using acc_data_t = typename image_trait>::acc_data_t; + + return_t operator()(acc_data_t &&origin_data) { + return return_t(origin_data.r(), origin_data.g()); + } +}; +template +struct fetch_data> + : public fetch_data> {}; +template struct fetch_data> { + using return_t = typename image_trait>::data_t; + using acc_data_t = typename image_trait>::acc_data_t; + + return_t operator()(acc_data_t &&origin_data) { + return return_t(origin_data.r(), origin_data.g(), origin_data.b(), + origin_data.a()); + } +}; + +/// Create image according with given type \p T and \p dims. +template static image_wrapper_base *create_image_wrapper(int dims); + +/// Create image with given data type \p T, channel order and dims +template +static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims); + +/// Create image with channel info and specified dimensions. +static image_wrapper_base *create_image_wrapper(image_channel channel, int dims); + +} // namespace detail + +/// Image channel info, include channel number, order, data width and type +class image_channel { + image_channel_data_type _type = image_channel_data_type::signed_int; + /// Number of channels. + unsigned _channel_num = 0; + /// Total size of all channels in bytes. + unsigned _total_size = 0; + /// Size of each channel in bytes. + unsigned _channel_size = 0; + +public: + /// Create image channel info according to template argument \p T. + template static image_channel create() { + image_channel channel; + channel.set_channel_size(detail::image_trait::channel_num, + sizeof(typename detail::image_trait::elem_t) * + 8); + channel.set_channel_data_type(detail::image_trait::data_type); + return channel; + } + + image_channel() = default; + + image_channel_data_type get_channel_data_type() { return _type; } + void set_channel_data_type(image_channel_data_type type) { _type = type; } + + unsigned get_total_size() { return _total_size; } + + unsigned get_channel_num() { return _channel_num; } + void set_channel_num(unsigned channel_num) { + _channel_num = channel_num; + _total_size = _channel_size * _channel_num; + } + + /// image_channel constructor. + /// \param r Channel r width in bits. + /// \param g Channel g width in bits. Should be same with \p r, or zero. + /// \param b Channel b width in bits. Should be same with \p g, or zero. + /// \param a Channel a width in bits. Should be same with \p b, or zero. + /// \param data_type Image channel data type: signed_nt, unsigned_int or fp. + image_channel(int r, int g, int b, int a, image_channel_data_type data_type) { + _type = data_type; + if (a) { + assert(r == a && "SYCL doesn't support different channel size"); + assert(r == b && "SYCL doesn't support different channel size"); + assert(r == g && "SYCL doesn't support different channel size"); + set_channel_size(4, a); + } else if (b) { + assert(r == b && "SYCL doesn't support different channel size"); + assert(r == g && "SYCL doesn't support different channel size"); + set_channel_size(3, b); + } else if (g) { + assert(r == g && "SYCL doesn't support different channel size"); + set_channel_size(2, g); + } else { + set_channel_size(1, r); + } + } + + sycl::image_channel_type get_channel_type() const { + if (_channel_size == 4) { + if (_type == image_channel_data_type::signed_int) + return sycl::image_channel_type::signed_int32; + else if (_type == image_channel_data_type::unsigned_int) + return sycl::image_channel_type::unsigned_int32; + else if (_type == image_channel_data_type::fp) + return sycl::image_channel_type::fp32; + } else if (_channel_size == 2) { + if (_type == image_channel_data_type::signed_int) + return sycl::image_channel_type::signed_int16; + else if (_type == image_channel_data_type::unsigned_int) + return sycl::image_channel_type::unsigned_int16; + else if (_type == image_channel_data_type::fp) + return sycl::image_channel_type::fp16; + } else { + if (_type == image_channel_data_type::signed_int) + return sycl::image_channel_type::signed_int8; + else if (_type == image_channel_data_type::unsigned_int) + return sycl::image_channel_type::unsigned_int8; + } + assert(false && "unexpected channel data kind and channel size"); + return sycl::image_channel_type::signed_int32; + } + void set_channel_type(sycl::image_channel_type type) { + switch (type) { + case sycl::image_channel_type::unsigned_int8: + _type = image_channel_data_type::unsigned_int; + _channel_size = 1; + break; + case sycl::image_channel_type::unsigned_int16: + _type = image_channel_data_type::unsigned_int; + _channel_size = 2; + break; + case sycl::image_channel_type::unsigned_int32: + _type = image_channel_data_type::unsigned_int; + _channel_size = 4; + break; + case sycl::image_channel_type::signed_int8: + _type = image_channel_data_type::signed_int; + _channel_size = 1; + break; + case sycl::image_channel_type::signed_int16: + _type = image_channel_data_type::signed_int; + _channel_size = 2; + break; + case sycl::image_channel_type::signed_int32: + _type = image_channel_data_type::signed_int; + _channel_size = 4; + break; + case sycl::image_channel_type::fp16: + _type = image_channel_data_type::fp; + _channel_size = 2; + break; + case sycl::image_channel_type::fp32: + _type = image_channel_data_type::fp; + _channel_size = 4; + break; + default: + break; + } + _total_size = _channel_size * _channel_num; + } + + sycl::image_channel_order get_channel_order() const { + switch (_channel_num) { + case 1: + return sycl::image_channel_order::r; + case 2: + return sycl::image_channel_order::rg; + case 3: + return sycl::image_channel_order::rgb; + case 4: + return sycl::image_channel_order::rgba; + default: + return sycl::image_channel_order::r; + } + } + /// Get the size for each channel in bits. + unsigned get_channel_size() const { return _channel_size * 8; } + + /// Set channel size. + /// \param in_channel_num Channels number to set. + /// \param channel_size Size for each channel in bits. + void set_channel_size(unsigned in_channel_num, + unsigned channel_size) { + if (in_channel_num < _channel_num) + return; + _channel_num = in_channel_num; + _channel_size = channel_size / 8; + _total_size = _channel_size * _channel_num; + } +}; + +/// 2D or 3D matrix data for image. +class image_matrix { + image_channel _channel; + int _range[3] = {1, 1, 1}; + int _dims = 0; + void *_host_data = nullptr; + + /// Set range of each dimension. + template void set_range(sycl::range range) { + for (int i = 0; i < dimensions; ++i) + _range[i] = range[i]; + _dims = dimensions; + } + + template + sycl::range get_range(integer_sequence) { + return sycl::range(_range[DimIdx]...); + } + +public: + /// Constructor with channel info and dimension size info. + template + image_matrix(image_channel channel, sycl::range range) + : _channel(channel) { + set_range(range); + _host_data = std::malloc(range.size() * _channel.get_total_size()); + } + image_matrix(sycl::image_channel_type channel_type, unsigned channel_num, + size_t x, size_t y) { + _channel.set_channel_type(channel_type); + _channel.set_channel_num(channel_num); + _dims = 1; + _range[0] = x; + if (y) { + _dims = 2; + _range[1] = y; + } + _host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size()); + } + + /// Construct a new image class with the matrix data. + template sycl::image *create_image() { + return create_image(_channel); + } + /// Construct a new image class with the matrix data. + template + sycl::image *create_image(image_channel channel) { + return new sycl::image( + _host_data, channel.get_channel_order(), channel.get_channel_type(), + get_range(make_index_sequence()), + sycl::property::image::use_host_ptr()); + } + + /// Get channel info. + inline image_channel get_channel() { return _channel; } + /// Get range of the image. + sycl::range<3> get_range() { + return sycl::range<3>(_range[0], _range[1], _range[2]); + } + /// Get matrix dims. + inline int get_dims() { return _dims; } + /// Convert to pitched data. + pitched_data to_pitched_data() { + return pitched_data(_host_data, _range[0] * _channel.get_total_size(), + _range[0], _range[1]); + } + + ~image_matrix() { + if (_host_data) + std::free(_host_data); + _host_data = nullptr; + } +}; +using image_matrix_p = image_matrix *; + +enum class image_data_type { matrix, linear, pitch, unsupport }; + +/// Image data info. +class image_data { +public: + image_data() { _type = image_data_type::unsupport; } + image_data(image_matrix_p matrix_data) { set_data(matrix_data); } + image_data(void *data_ptr, size_t x_size, image_channel channel) { + set_data(data_ptr, x_size, channel); + } + image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size, + image_channel channel) { + set_data(data_ptr, x_size, y_size, pitch_size, channel); + } + void set_data(image_matrix_p matrix_data) { + _type = image_data_type::matrix; + _data = matrix_data; + _channel = matrix_data->get_channel(); + } + void set_data(void *data_ptr, size_t x_size, image_channel channel) { + _type = image_data_type::linear; + _data = data_ptr; + _x = x_size; + _channel = channel; + } + void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size, + image_channel channel) { + _type = image_data_type::pitch; + _data = data_ptr; + _x = x_size; + _y = y_size; + _pitch = pitch_size; + _channel = channel; + } + + image_data_type get_data_type() const { return _type; } + void set_data_type(image_data_type type) { _type = type; } + + void *get_data_ptr() const { return _data; } + void set_data_ptr(void *data) { _data = data; } + + size_t get_x() const { return _x; } + void set_x(size_t x) { _x = x; } + + size_t get_y() const { return _y; } + void set_y(size_t y) { _y = y; } + + size_t get_pitch() const { return _pitch; } + void set_pitch(size_t pitch) { _pitch = pitch; } + + image_channel get_channel() const { return _channel; } + void set_channel(image_channel channel) { _channel = channel; } + + image_channel_data_type get_channel_data_type() { + return _channel.get_channel_data_type(); + } + void set_channel_data_type(image_channel_data_type type) { + _channel.set_channel_data_type(type); + } + + unsigned get_channel_size() { return _channel.get_channel_size(); } + void set_channel_size(unsigned channel_num, unsigned channel_size) { + return _channel.set_channel_size(channel_num, channel_size); + } + + unsigned get_channel_num() { return _channel.get_channel_num(); } + void set_channel_num(unsigned num) { + return _channel.set_channel_num(num); + } + + sycl::image_channel_type get_channel_type() { + return _channel.get_channel_type(); + } + void set_channel_type(sycl::image_channel_type type) { + return _channel.set_channel_type(type); + } + +private: + image_data_type _type; + void *_data = nullptr; + size_t _x, _y, _pitch; + image_channel _channel; +}; + +/// Image sampling info, include addressing mode, filtering mode and +/// normalization info. +class sampling_info { + sycl::addressing_mode _addressing_mode = + sycl::addressing_mode::clamp_to_edge; + sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest; + sycl::coordinate_normalization_mode _coordinate_normalization_mode = + sycl::coordinate_normalization_mode::unnormalized; + +public: + sycl::addressing_mode get_addressing_mode() { return _addressing_mode; } + void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; } + + sycl::filtering_mode get_filtering_mode() { return _filtering_mode; } + void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; } + + sycl::coordinate_normalization_mode get_coordinate_normalization_mode() { + return _coordinate_normalization_mode; + } + void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) { + _coordinate_normalization_mode = coordinate_normalization_mode; + } + + bool is_coordinate_normalized() { + return _coordinate_normalization_mode == + sycl::coordinate_normalization_mode::normalized; + } + void set_coordinate_normalization_mode(int is_normalized) { + _coordinate_normalization_mode = + is_normalized ? sycl::coordinate_normalization_mode::normalized + : sycl::coordinate_normalization_mode::unnormalized; + } + void + set(sycl::addressing_mode addressing_mode, + sycl::filtering_mode filtering_mode, + sycl::coordinate_normalization_mode coordinate_normalization_mode) { + set(addressing_mode); + set(filtering_mode); + set(coordinate_normalization_mode); + } + void set(sycl::addressing_mode addressing_mode, + sycl::filtering_mode filtering_mode, int is_normalized) { + set(addressing_mode); + set(filtering_mode); + set_coordinate_normalization_mode(is_normalized); + } + + sycl::sampler get_sampler() { + return sycl::sampler(_coordinate_normalization_mode, _addressing_mode, + _filtering_mode); + } +}; + +/// Image base class. +class image_wrapper_base { + sampling_info _sampling_info; + image_data _data; + +public: + virtual ~image_wrapper_base() = 0; + + void attach(image_data data) { set_data(data); } + /// Attach matrix data to this class. + void attach(image_matrix *matrix) { + detach(); + image_wrapper_base::set_data(image_data(matrix)); + } + /// Attach matrix data to this class. + void attach(image_matrix *matrix, image_channel channel) { + attach(matrix); + image_wrapper_base::set_channel(channel); + } + /// Attach linear data to this class. + void attach(const void *ptr, size_t count) { + attach(ptr, count, get_channel()); + } + /// Attach linear data to this class. + void attach(const void *ptr, size_t count, image_channel channel) { + detach(); + image_wrapper_base::set_data(image_data(const_cast(ptr), count, channel)); + } + /// Attach 2D data to this class. + void attach(const void *data, size_t x, size_t y, size_t pitch) { + attach(data, x, y, pitch, get_channel()); + } + /// Attach 2D data to this class. + void attach(const void *data, size_t x, size_t y, size_t pitch, + image_channel channel) { + detach(); + image_wrapper_base::set_data( + image_data(const_cast(data), x, y, pitch, channel)); + } + /// Detach data. + virtual void detach() {} + + sampling_info get_sampling_info() { return _sampling_info; } + void set_sampling_info(sampling_info info) { + _sampling_info = info; + } + const image_data &get_data() { return _data; } + void set_data(image_data data) { _data = data; } + + image_channel get_channel() { return _data.get_channel(); } + void set_channel(image_channel channel) { _data.set_channel(channel); } + + image_channel_data_type get_channel_data_type() { + return _data.get_channel_data_type(); + } + void set_channel_data_type(image_channel_data_type type) { + _data.set_channel_data_type(type); + } + + unsigned get_channel_size() { return _data.get_channel_size(); } + void set_channel_size(unsigned channel_num, unsigned channel_size) { + return _data.set_channel_size(channel_num, channel_size); + } + + sycl::addressing_mode get_addressing_mode() { + return _sampling_info.get_addressing_mode(); + } + void set(sycl::addressing_mode addressing_mode) { + _sampling_info.set(addressing_mode); + } + + sycl::filtering_mode get_filtering_mode() { + return _sampling_info.get_filtering_mode(); + } + void set(sycl::filtering_mode filtering_mode) { + _sampling_info.set(filtering_mode); + } + + sycl::coordinate_normalization_mode get_coordinate_normalization_mode() { + return _sampling_info.get_coordinate_normalization_mode(); + } + void + set(sycl::coordinate_normalization_mode coordinate_normalization_mode) { + _sampling_info.set(coordinate_normalization_mode); + } + + bool is_coordinate_normalized() { + return _sampling_info.is_coordinate_normalized(); + } + void set_coordinate_normalization_mode(int is_normalized) { + _sampling_info.set_coordinate_normalization_mode(is_normalized); + } + void + set(sycl::addressing_mode addressing_mode, + sycl::filtering_mode filtering_mode, + sycl::coordinate_normalization_mode coordinate_normalization_mode) { + set(addressing_mode); + set(filtering_mode); + set(coordinate_normalization_mode); + } + void set(sycl::addressing_mode addressing_mode, + sycl::filtering_mode filtering_mode, int is_normalized) { + set(addressing_mode); + set(filtering_mode); + set_coordinate_normalization_mode(is_normalized); + } + + unsigned get_channel_num() { return _data.get_channel_num(); } + void set_channel_num(unsigned num) { + return _data.set_channel_num(num); + } + + sycl::image_channel_type get_channel_type() { + return _data.get_channel_type(); + } + void set_channel_type(sycl::image_channel_type type) { + return _data.set_channel_type(type); + } + + sycl::sampler get_sampler() { + sycl::sampler smp = _sampling_info.get_sampler(); + /// linear memory only used for sycl::filtering_mode::nearest. + if (_data.get_data_type() == image_data_type::linear) { + smp = sycl::sampler(smp.get_coordinate_normalization_mode(), + smp.get_addressing_mode(), + sycl::filtering_mode::nearest); + } + return smp; + } +}; +inline image_wrapper_base::~image_wrapper_base() {} +using image_wrapper_base_p = image_wrapper_base *; + +template class image_accessor_ext; + +/// Image class, wrapper of sycl::image. +template class image_wrapper : public image_wrapper_base { + sycl::image *_image = nullptr; + +#ifndef DPCT_USM_LEVEL_NONE + std::vector _host_buffer; +#endif + + void create_image(sycl::queue q) { + auto &data = get_data(); + if (data.get_data_type() == image_data_type::matrix) { + _image = static_cast(data.get_data_ptr()) + ->create_image(data.get_channel()); + return; + } + auto ptr = data.get_data_ptr(); + auto channel = data.get_channel(); + + if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) { +#ifdef DPCT_USM_LEVEL_NONE + ptr = get_buffer(ptr) + .template get_access() + .get_pointer(); +#else + auto sz = data.get_x(); + if (data.get_data_type() == image_data_type::pitch) + sz *= channel.get_total_size() * data.get_y(); + _host_buffer.resize(sz); + q.memcpy(_host_buffer.data(), ptr, sz).wait(); + ptr = _host_buffer.data(); +#endif + } + + if constexpr (dimensions == 1) { + assert(data.get_data_type() == image_data_type::linear); + _image = new sycl::image<1>( + ptr, channel.get_channel_order(), channel.get_channel_type(), + sycl::range<1>(data.get_x() / channel.get_total_size())); + } else if constexpr (dimensions == 2) { + assert(data.get_data_type() == image_data_type::pitch); + _image = new sycl::image<2>(ptr, channel.get_channel_order(), + channel.get_channel_type(), + sycl::range<2>(data.get_x(), data.get_y()), + sycl::range<1>(data.get_pitch())); + } else { + throw std::runtime_error("3D image only support matrix data"); + } + return; + } + +public: + using acc_data_t = typename detail::image_trait::acc_data_t; + using accessor_t = + typename image_accessor_ext::accessor_t; + + image_wrapper() { set_channel(image_channel::create()); } + ~image_wrapper() { detach(); } + + /// Get image accessor. + accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) { + if (!_image) + create_image(q); + return accessor_t(*_image, cgh); + } + + /// Detach data. + void detach() override { + if (_image) + delete _image; + _image = nullptr; + } +}; + +/// Wrap sampler and image accessor together. +template +class image_accessor_ext { +public: + using accessor_t = + typename detail::image_trait::template accessor_t; + using data_t = typename detail::image_trait::data_t; + sycl::sampler _sampler; + accessor_t _img_acc; + +public: + image_accessor_ext(sycl::sampler sampler, accessor_t acc) + : _sampler(sampler), _img_acc(acc) {} + + /// Read data from accessor. + template + typename std::enable_if::type read(float x, float y, + float z) { + return detail::fetch_data()( + _img_acc.read(sycl::float4(x, y, z, 0), _sampler)); + } + /// Read data from accessor. + template ::value + &&std::is_integral::value + &&std::is_integral::value> + typename std::enable_if::type read(Coord0 x, Coord1 y, + Coord2 z) { + return detail::fetch_data()( + _img_acc.read(sycl::int4(x, y, z, 0), _sampler)); + } + /// Read data from accessor. + template + typename std::enable_if::type read(float x, float y) { + return detail::fetch_data()( + _img_acc.read(sycl::float2(x, y), _sampler)); + } + /// Read data from accessor. + template ::value + &&std::is_integral::value> + typename std::enable_if::type read(Coord0 x, Coord1 y) { + return detail::fetch_data()( + _img_acc.read(sycl::int2(x, y), _sampler)); + } + /// Read data from accessor. + template + typename std::enable_if::type read(float x) { + return detail::fetch_data()(_img_acc.read(x, _sampler)); + } + /// Read data from accessor. + template ::value> + typename std::enable_if::type read(CoordT x) { + return detail::fetch_data()(_img_acc.read(x, _sampler)); + } +}; + +template class image_accessor_ext { +public: + using accessor_t = + typename detail::image_trait::template array_accessor_t; + using data_t = typename detail::image_trait::data_t; + sycl::sampler _sampler; + accessor_t _img_acc; + +public: + image_accessor_ext(sycl::sampler sampler, accessor_t acc) + : _sampler(sampler), _img_acc(acc) {} + + /// Read data from accessor. + template + typename std::enable_if::type read(int index, float x, + float y) { + return detail::fetch_data()( + _img_acc[index].read(sycl::float2(x, y), _sampler)); + } + /// Read data from accessor. + template + typename std::enable_if::type read(int index, int x, int y) { + return detail::fetch_data()( + _img_acc[index].read(sycl::int2(x, y), _sampler)); + } + /// Read data from accessor. + template + typename std::enable_if::type read(int index, float x) { + return detail::fetch_data()( + _img_acc[index].read(x, _sampler)); + } + /// Read data from accessor. + template + typename std::enable_if::type read(int index, int x) { + return detail::fetch_data()( + _img_acc[index].read(x, _sampler)); + } +}; + +/// Create image wrapper according to image data and sampling info. +/// \return Pointer to image wrapper base class. +/// \param data Image data used to create image wrapper. +/// \param info Image sampling info used to create image wrapper. +/// \returns Pointer to base class of created image wrapper object. +static inline image_wrapper_base *create_image_wrapper(image_data data, + sampling_info info) { + image_channel channel; + int dims = 1; + if (data.get_data_type() == image_data_type::matrix) { + auto matrix = (image_matrix_p)data.get_data_ptr(); + channel = matrix->get_channel(); + dims = matrix->get_dims(); + } else { + if (data.get_data_type() == image_data_type::pitch) { + dims = 2; + } + channel = data.get_channel(); + } + + if (auto ret = detail::create_image_wrapper(channel, dims)) { + ret->set_sampling_info(info); + ret->set_data(data); + return ret; + } + return nullptr; +} + +namespace detail { +/// Create image according with given type \p T and \p dims. +template static image_wrapper_base *create_image_wrapper(int dims) { + switch (dims) { + case 1: + return new image_wrapper(); + case 2: + return new image_wrapper(); + case 3: + return new image_wrapper(); + default: + return nullptr; + } +} +/// Create image with given data type \p T, channel order and dims +template +static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) { + switch (channel_num) { + case 1: + return create_image_wrapper(dims); + case 2: + return create_image_wrapper>(dims); + case 3: + return create_image_wrapper>(dims); + case 4: + return create_image_wrapper>(dims); + default: + return nullptr; + } +} + +/// Create image with channel info and specified dimensions. +static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) { + switch (channel.get_channel_type()) { + case sycl::image_channel_type::fp16: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::fp32: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::signed_int8: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::signed_int16: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::signed_int32: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::unsigned_int8: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::unsigned_int16: + return create_image_wrapper(channel.get_channel_num(), dims); + case sycl::image_channel_type::unsigned_int32: + return create_image_wrapper(channel.get_channel_num(), dims); + default: + return nullptr; + } +} +} // namespace detail + +} // namespace dpct + +#endif // !__DPCT_IMAGE_HPP__ diff --git a/dpct/kernel.hpp b/dpct/kernel.hpp new file mode 100644 index 0000000000000..11d1321bb4086 --- /dev/null +++ b/dpct/kernel.hpp @@ -0,0 +1,459 @@ +//==---- kernel.hpp -------------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_KERNEL_HPP__ +#define __DPCT_KERNEL_HPP__ + +#include +#ifdef _WIN32 +#include +#include +#else +#include +#endif + +#if defined(__has_include) && __has_include() +#include +#elif defined(__has_include) && __has_include() +#include +#else +#error "SYCLomatic runtime requires C++ filesystem support" +#endif + +#include +#include +#include + +namespace dpct { + +typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &, + unsigned int, void **, void **); + +struct kernel_function_info { + int max_work_group_size = 0; +}; + +static inline void get_kernel_function_info(kernel_function_info *kernel_info, + const void *function) { + kernel_info->max_work_group_size = + dpct::dev_mgr::instance() + .current_device() + .get_info(); +} +static inline kernel_function_info +get_kernel_function_info(const void *function) { + kernel_function_info kernel_info; + kernel_info.max_work_group_size = + dpct::dev_mgr::instance() + .current_device() + .get_info(); + return kernel_info; +} + + +namespace detail { + +#if defined(__has_include) && __has_include() +namespace fs = std::filesystem; +#else +namespace fs = std::experimental::filesystem; +#endif + +/// Write data to temporary file and return absolute path to temporary file. +/// Temporary file is created in a temporary directory both of which have random +/// names with only the user having access permissions. Only one temporary file +/// will be created in the temporary directory. +static inline fs::path write_data_to_file(char const *const data, size_t size) { + std::error_code ec; + + if (sizeof(size_t) >= sizeof(std::streamsize) && + size > (std::numeric_limits::max)()) + throw std::runtime_error("data file too large"); + + // random number generator + std::random_device dev; + std::mt19937 prng(dev()); + std::uniform_int_distribution rand(0); + + // find temporary directory + auto tmp_dir = fs::temp_directory_path(ec); + if (ec) + throw std::runtime_error("could not find temporary directory"); + + // create private directory + std::stringstream directory; + fs::path directory_path; + constexpr int max_attempts = 5; + int i; + + for (i = 0; i < max_attempts; i++) { + directory << std::hex << rand(prng); + directory_path = tmp_dir / directory.str(); + if (fs::create_directory(directory_path)) { + break; + } + } + if (i == max_attempts) + throw std::runtime_error("could not create directory"); + + // only allow owner permissions to private directory + fs::permissions(directory_path, fs::perms::owner_all, ec); + if (ec) + throw std::runtime_error("could not set directory permissions"); + + // random filename in private directory + std::stringstream filename; + filename << std::hex << rand(prng); +#ifdef _WIN32 + auto filepath = directory_path / (filename.str() + ".dll"); +#else + auto filepath = directory_path / filename.str(); +#endif + + // write data to temporary file + auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary); + if (outfile) { + // only allow program to write file + fs::permissions(filepath, fs::perms::owner_write, ec); + if (ec) + throw std::runtime_error("could not set permissions"); + + outfile.write(data, size); + if (!outfile.good()) + throw std::runtime_error("could not write data"); + outfile.close(); + + // only allow program to read/execute file + fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec, + ec); + if (ec) + throw std::runtime_error("could not set permissions"); + } else + throw std::runtime_error("could not write data"); + + // check temporary file contents + auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary); + if (infile) { + bool mismatch = false; + size_t cnt = 0; + + while (1) { + char c; + infile.get(c); + if (infile.eof()) + break; + if (c != data[cnt++]) + mismatch = true; + } + if (cnt != size || mismatch) + throw std::runtime_error("file contents not written correctly"); + } else + throw std::runtime_error("could not validate file"); + + if (!filepath.is_absolute()) + throw std::runtime_error("temporary filepath is not absolute"); + + return filepath; +} + +static inline uint16_t extract16(unsigned char const *const ptr) { + uint16_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + + return (ret); +} + +static inline uint32_t extract32(unsigned char const *const ptr) { + uint32_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + ret |= static_cast(ptr[2]) << 16; + ret |= static_cast(ptr[3]) << 24; + + return (ret); +} + +static inline uint64_t extract64(unsigned char const *const ptr) { + uint64_t ret = 0; + + ret |= static_cast(ptr[0]) << 0; + ret |= static_cast(ptr[1]) << 8; + ret |= static_cast(ptr[2]) << 16; + ret |= static_cast(ptr[3]) << 24; + ret |= static_cast(ptr[4]) << 32; + ret |= static_cast(ptr[5]) << 40; + ret |= static_cast(ptr[6]) << 48; + ret |= static_cast(ptr[7]) << 56; + + return (ret); +} + +static inline uint64_t get_lib_size(char const *const blob) { +#ifdef _WIN32 + /////////////////////////////////////////////////////////////////////// + // Analyze DOS stub + unsigned char const *const ublob = + reinterpret_cast(blob); + if (ublob[0] != 0x4d || ublob[1] != 0x5a) { + throw std::runtime_error("Blob is not a Windows DLL."); + } + uint32_t pe_header_offset = extract32(ublob + 0x3c); + + /////////////////////////////////////////////////////////////////////// + // Ananlyze PE-header + unsigned char const *const pe_header = ublob + pe_header_offset; + + // signature + uint32_t pe_signature = extract32(pe_header + 0); + if (pe_signature != 0x00004550) { + throw std::runtime_error("PE-header signature is not 0x00004550"); + } + + // machine + uint16_t machine = extract16(pe_header + 4); + if (machine != 0x8664) { + throw std::runtime_error("Only DLLs for x64 supported"); + } + + // number of sections + uint16_t number_of_sections = extract16(pe_header + 6); + + // sizeof optional header + uint16_t sizeof_optional_header = extract16(pe_header + 20); + + // magic + uint16_t magic = extract16(pe_header + 24); + if (magic != 0x10b && magic != 0x20b) { + throw std::runtime_error("MAGIC is not 0x010b or 0x020b"); + } + + /////////////////////////////////////////////////////////////////////// + // Analyze tail of optional header + constexpr int coff_header_size = 24; + + unsigned char const *const tail_of_optional_header = + pe_header + coff_header_size + sizeof_optional_header; + if (extract64(tail_of_optional_header - 8) != 0) { + throw std::runtime_error("Optional header not zero-padded"); + } + + /////////////////////////////////////////////////////////////////////// + // Analyze last section header + constexpr int section_header_size = 40; + unsigned char const *const last_section_header = + tail_of_optional_header + section_header_size * (number_of_sections - 1); + + uint32_t sizeof_raw_data = extract32(last_section_header + 16); + uint32_t pointer_to_raw_data = extract32(last_section_header + 20); + + return sizeof_raw_data + pointer_to_raw_data; +#else + if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F') + throw std::runtime_error("Blob is not in ELF format"); + + if (blob[4] != 0x02) + throw std::runtime_error("Only 64-bit headers are supported"); + + if (blob[5] != 0x01) + throw std::runtime_error("Only little-endian headers are supported"); + + unsigned char const *const ublob = + reinterpret_cast(blob); + uint64_t e_shoff = extract64(ublob + 0x28); + uint16_t e_shentsize = extract16(ublob + 0x3A); + uint16_t e_shnum = extract16(ublob + 0x3C); + + return e_shoff + (e_shentsize * e_shnum); +#endif +} + +#ifdef _WIN32 +class path_lib_record { +public: + void operator=(const path_lib_record &) = delete; + ~path_lib_record() { + for (auto entry : lib_to_path) { + FreeLibrary(static_cast(entry.first)); + fs::permissions(entry.second, fs::perms::owner_all); + fs::remove_all(entry.second.remove_filename()); + } + } + static void record_lib_path(fs::path path, void *library) { + lib_to_path[library] = path; + } + static void remove_lib(void *library) { + auto path = lib_to_path[library]; + std::error_code ec; + + FreeLibrary(static_cast(library)); + fs::permissions(path, fs::perms::owner_all); + if (fs::remove_all(path.remove_filename(), ec) != 2 || ec) + // one directory and one temporary file should have been deleted + throw std::runtime_error("Directory delete failed"); + + lib_to_path.erase(library); + } + +private: + static inline std::unordered_map lib_to_path; +}; +#endif + +} // namespace detail + +class kernel_library { +public: + kernel_library() : ptr{nullptr} {} + kernel_library(void *ptr) : ptr{ptr} {} + + operator void *() const { return ptr; } + +private: + void *ptr; +#ifdef _WIN32 + static inline detail::path_lib_record single_instance_to_trigger_destructor; +#endif +}; + +namespace detail { + +static inline kernel_library load_dl_from_data(char const *const data, + size_t size) { + fs::path filename = write_data_to_file(data, size); +#ifdef _WIN32 + void *so = LoadLibraryW(filename.wstring().c_str()); +#else + void *so = dlopen(filename.c_str(), RTLD_LAZY); +#endif + if (so == nullptr) + throw std::runtime_error("Failed to load kernel library"); + +#ifdef _WIN32 + detail::path_lib_record::record_lib_path(filename, so); +#else + std::error_code ec; + + // Windows DLL cannot be deleted while in use + if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec) + // one directory and one temporary file should have been deleted + throw std::runtime_error("Directory delete failed"); +#endif + + return so; +} + +} // namespace detail + +/// Load kernel library and return a handle to use the library. +/// \param [in] name The name of the library. +static inline kernel_library load_kernel_library(const std::string &name) { + std::ifstream ifs; + ifs.open(name, std::ios::in | std::ios::binary); + + std::stringstream buffer; + buffer << ifs.rdbuf(); + + const std::string buffer_string = buffer.str(); + return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size()); +} + +/// Load kernel library whose image is alreay in memory and return a handle to +/// use the library. +/// \param [in] image A pointer to the image in memory. +static inline kernel_library load_kernel_library_mem(char const *const image) { + const size_t size = detail::get_lib_size(image); + + return detail::load_dl_from_data(image, size); +} + +/// Unload kernel library. +/// \param [in,out] library Handle to the library to be closed. +static inline void unload_kernel_library(const kernel_library &library) { +#ifdef _WIN32 + detail::path_lib_record::remove_lib(library); +#else + dlclose(library); +#endif +} + +class kernel_function { +public: + kernel_function() : ptr{nullptr} {} + kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {} + + operator void *() const { return ((void *)ptr); } + + void operator()(sycl::queue &q, const sycl::nd_range<3> &range, + unsigned int a, void **args, void **extra) { + ptr(q, range, a, args, extra); + } + +private: + dpct::kernel_functor ptr; +}; + +/// Find kernel function in a kernel library and return its address. +/// \param [in] library Handle to the kernel library. +/// \param [in] name Name of the kernel function. +static inline dpct::kernel_function +get_kernel_function(kernel_library &library, const std::string &name) { +#ifdef _WIN32 + dpct::kernel_functor fn = reinterpret_cast( + GetProcAddress(static_cast(static_cast(library)), + (name + std::string("_wrapper")).c_str())); +#else + dpct::kernel_functor fn = reinterpret_cast( + dlsym(library, (name + std::string("_wrapper")).c_str())); +#endif + if (fn == nullptr) + throw std::runtime_error("Failed to get function"); + return fn; +} + +/// Invoke a kernel function. +/// \param [in] function kernel function. +/// \param [in] queue SYCL queue used to execute kernel +/// \param [in] groupRange SYCL group range +/// \param [in] localRange SYCL local range +/// \param [in] localMemSize The size of local memory required by the kernel +/// function. +/// \param [in] kernelParams Array of pointers to kernel arguments. +/// \param [in] extra Extra arguments. +static inline void invoke_kernel_function(dpct::kernel_function &function, + sycl::queue &queue, + sycl::range<3> groupRange, + sycl::range<3> localRange, + unsigned int localMemSize, + void **kernelParams, void **extra) { + function(queue, sycl::nd_range<3>(groupRange * localRange, localRange), + localMemSize, kernelParams, extra); +} + +/// Find image wrapper in a kernel library and return its address. +/// \param [in] library Handle to the kernel library. +/// \param [in] name Name of the target image wrapper. +static inline dpct::image_wrapper_base_p +get_image_wrapper(dpct::kernel_library &library, const std::string &name) { +#ifdef _WIN32 + dpct::image_wrapper_base_p fn = + reinterpret_cast(GetProcAddress( + static_cast(static_cast(library)), name.c_str())); +#else + dpct::image_wrapper_base_p fn = reinterpret_cast( + dlsym(library, name.c_str())); +#endif + if (fn == nullptr) + throw std::runtime_error("Failed to get image"); + return fn; +} + +} // namespace dpct +#endif // __DPCT_KERNEL_HPP__ diff --git a/dpct/lapack_utils.hpp b/dpct/lapack_utils.hpp new file mode 100644 index 0000000000000..dac77d5773ec4 --- /dev/null +++ b/dpct/lapack_utils.hpp @@ -0,0 +1,1953 @@ +//==---- lapack_utils.hpp -------------------------*- C++ -*----------------==// +// +// Copyright (C) Intel Corporation +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef __DPCT_LAPACK_UTILS_HPP__ +#define __DPCT_LAPACK_UTILS_HPP__ + +#include "memory.hpp" +#include "util.hpp" +#include "lib_common_utils.hpp" + +#include +#include + +namespace dpct { +namespace lapack { +/// Computes all eigenvalues and, optionally, eigenvectors of a real generalized +/// symmetric definite eigenproblem using a divide and conquer method. +/// \return Returns 0 if no synchronous exception, otherwise returns 1. +/// \param [in] queue Device queue where calculations will be performed. It must +/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is +/// not defined). +/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved. +/// \param [in] jobz Must be job::novec or job::vec. +/// \param [in] uplo Must be uplo::upper or uplo::lower. +/// \param [in] n The order of the matrices A and B. +/// \param [in,out] a The symmetric matrix A. +/// \param [in] lda The leading dimension of matrix A. +/// \param [in,out] b The symmetric matrix B. +/// \param [in] ldb The leading dimension of matrix B. +/// \param [out] w Eigenvalues. +/// \param [in] scratchpad Scratchpad memory to be used by the routine +/// for storing intermediate results. +/// \param [in] scratchpad_size Size of scratchpad memory as a number of +/// floating point elements of type T. +/// \param [out] info If lapack synchronous exception is caught, the value +/// returned from info() method of the exception is set to \p info. +template +inline int sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb, + T *w, T *scratchpad, int scratchpad_size, int *info) { +#ifdef DPCT_USM_LEVEL_NONE + auto info_buf = get_buffer(info); + auto a_buffer = get_buffer(a); + auto b_buffer = get_buffer(b); + auto w_buffer = get_buffer(w); + auto scratchpad_buffer = get_buffer(scratchpad); + int info_val = 0; + int ret_val = 0; + try { + oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a_buffer, lda, + b_buffer, ldb, w_buffer, scratchpad_buffer, + scratchpad_size); + } catch (oneapi::mkl::lapack::exception const& e) { + std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd" + << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + info_val = static_cast(e.info()); + ret_val = 1; + } catch (sycl::exception const& e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + ret_val = 1; + } + queue.submit([&, info_val](sycl::handler &cgh) { + auto info_acc = info_buf.get_access(cgh); + cgh.single_task>( + [=]() { info_acc[0] = info_val; }); + }); + return ret_val; +#else + try { + oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, + scratchpad, scratchpad_size); + } catch (oneapi::mkl::lapack::exception const& e) { + std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd" + << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + int info_val = static_cast(e.info()); + queue.memcpy(info, &info_val, sizeof(int)).wait(); + return 1; + } catch (sycl::exception const& e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + queue.memset(info, 0, sizeof(int)).wait(); + return 1; + } + queue.memset(info, 0, sizeof(int)); + return 0; +#endif +} +/// Computes all the eigenvalues, and optionally, the eigenvectors of a complex +/// generalized Hermitian positive-definite eigenproblem using a divide and +/// conquer method. +/// \return Returns 0 if no synchronous exception, otherwise returns 1. +/// \param [in] queue Device queue where calculations will be performed. It must +/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is +/// not defined). +/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved. +/// \param [in] jobz Must be job::novec or job::vec. +/// \param [in] uplo Must be uplo::upper or uplo::lower. +/// \param [in] n The order of the matrices A and B. +/// \param [in,out] a The Hermitian matrix A. +/// \param [in] lda The leading dimension of matrix A. +/// \param [in,out] b The Hermitian matrix B. +/// \param [in] ldb The leading dimension of matrix B. +/// \param [in] w Eigenvalues. +/// \param [in] scratchpad Scratchpad memory to be used by the routine +/// for storing intermediate results. +/// \param [in] scratchpad_size Size of scratchpad memory as a number of +/// floating point elements of type T. +/// \param [out] info If lapack synchronous exception is caught, the value +/// returned from info() method of the exception is set to \p info. +template +inline int hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb, + Tw *w, T *scratchpad, int scratchpad_size, int *info) { + using Ty = typename DataType::T2; +#ifdef DPCT_USM_LEVEL_NONE + auto info_buf = get_buffer(info); + auto a_buffer = get_buffer(a); + auto b_buffer = get_buffer(b); + auto w_buffer = get_buffer(w); + auto scratchpad_buffer = get_buffer(scratchpad); + int info_val = 0; + int ret_val = 0; + try { + oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a_buffer, lda, + b_buffer, ldb, w_buffer, scratchpad_buffer, + scratchpad_size); + } catch (oneapi::mkl::lapack::exception const& e) { + std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd" + << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + info_val = static_cast(e.info()); + ret_val = 1; + } catch (sycl::exception const& e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + ret_val = 1; + } + queue.submit([&, info_val](sycl::handler &cgh) { + auto info_acc = info_buf.get_access(cgh); + cgh.single_task>( + [=]() { info_acc[0] = info_val; }); + }); + return ret_val; +#else + try { + oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, (Ty *)a, lda, (Ty *)b, + ldb, w, (Ty *)scratchpad, scratchpad_size); + } catch (oneapi::mkl::lapack::exception const& e) { + std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd" + << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + int info_val = static_cast(e.info()); + queue.memcpy(info, &info_val, sizeof(int)).wait(); + return 1; + } catch (sycl::exception const& e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + queue.memset(info, 0, sizeof(int)).wait(); + return 1; + } + queue.memset(info, 0, sizeof(int)); + return 0; +#endif +} +/// Computes the Cholesky factorizations of a batch of symmetric (or Hermitian, +/// for complex data) positive-definite matrices. +/// \return Returns 0 if no synchronous exception, otherwise returns 1. +/// \param [in] queue Device queue where calculations will be performed. It must +/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is +/// not defined). +/// \param [in] uplo Must be uplo::upper or uplo::lower. +/// \param [in] n The order of the matrix A. +/// \param [in,out] a Array of pointers to matrix A. +/// \param [in] lda The leading dimension of matrix A. +/// \param [out] info If lapack synchronous exception is caught, the value +/// returned from info() method of the exception is set to \p info. +/// \param [in] group_size The batch size. +template +inline int potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n, + T *a[], int lda, int *info, int group_size) { +#ifdef DPCT_USM_LEVEL_NONE + throw std::runtime_error("this API is unsupported when USM level is none"); +#else + using Ty = typename DataType::T2; + struct matrix_info_t { + oneapi::mkl::uplo uplo_info; + std::int64_t n_info; + std::int64_t lda_info; + std::int64_t group_size_info; + }; + matrix_info_t *matrix_info = + (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); + matrix_info->uplo_info = uplo; + matrix_info->n_info = n; + matrix_info->lda_info = lda; + matrix_info->group_size_info = group_size; + std::int64_t scratchpad_size = 0; + sycl::event e; + Ty *scratchpad = nullptr; + try { + scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size( + queue, &(matrix_info->uplo_info), &(matrix_info->n_info), + &(matrix_info->lda_info), 1, &(matrix_info->group_size_info)); + scratchpad = sycl::malloc_device(scratchpad_size, queue); + e = oneapi::mkl::lapack::potrf_batch( + queue, &(matrix_info->uplo_info), &(matrix_info->n_info), (Ty **)a, + &(matrix_info->lda_info), 1, &(matrix_info->group_size_info), + scratchpad, scratchpad_size); + } catch (oneapi::mkl::lapack::batch_error const &be) { + std::cerr << "Unexpected exception caught during call to LAPACK API: " + "potrf_batch_scratchpad_size/potrf_batch" + << std::endl + << "reason: " << be.what() << std::endl + << "number: " << be.info() << std::endl; + int i = 0; + auto &ids = be.ids(); + std::vector info_vec(group_size); + for (auto const &e : be.exceptions()) { + try { + std::rethrow_exception(e); + } catch (oneapi::mkl::lapack::exception &e) { + std::cerr << "Exception " << ids[i] << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + info_vec[i] = e.info(); + i++; + } + } + queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait(); + std::free(matrix_info); + if (scratchpad) + sycl::free(scratchpad, queue); + return 1; + } catch (sycl::exception const &e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + queue.memset(info, 0, group_size * sizeof(int)).wait(); + std::free(matrix_info); + if (scratchpad) + sycl::free(scratchpad, queue); + return 1; + } + queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.host_task([=] { + std::free(matrix_info); + sycl::free(scratchpad, queue); + }); + }); + queue.memset(info, 0, group_size * sizeof(int)); + return 0; +#endif +} +/// Solves a batch of systems of linear equations with a Cholesky-factored +/// symmetric (Hermitian) positive-definite coefficient matrices. +/// \return Returns 0 if no synchronous exception, otherwise returns 1. +/// \param [in] queue Device queue where calculations will be performed. It must +/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is +/// not defined). +/// \param [in] uplo Must be uplo::upper or uplo::lower. +/// \param [in] n The order of the matrix A. +/// \param [in] nrhs The number of right-hand sides. +/// \param [in,out] a Array of pointers to matrix A. +/// \param [in] lda The leading dimension of matrix A. +/// \param [in,out] b Array of pointers to matrix B. +/// \param [in] ldb The leading dimension of matrix B. +/// \param [out] info If lapack synchronous exception is caught, the value +/// returned from info() method of the exception is set to \p info. +/// \param [in] group_size The batch size. +template +inline int potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n, + int nrhs, T *a[], int lda, T *b[], int ldb, int *info, + int group_size) { +#ifdef DPCT_USM_LEVEL_NONE + throw std::runtime_error("this API is unsupported when USM level is none"); +#else + using Ty = typename DataType::T2; + struct matrix_info_t { + oneapi::mkl::uplo uplo_info; + std::int64_t n_info; + std::int64_t nrhs_info; + std::int64_t lda_info; + std::int64_t ldb_info; + std::int64_t group_size_info; + }; + matrix_info_t *matrix_info = + (matrix_info_t *)std::malloc(sizeof(matrix_info_t)); + matrix_info->uplo_info = uplo; + matrix_info->n_info = n; + matrix_info->nrhs_info = nrhs; + matrix_info->lda_info = lda; + matrix_info->ldb_info = ldb; + matrix_info->group_size_info = group_size; + std::int64_t scratchpad_size = 0; + sycl::event e; + Ty *scratchpad = nullptr; + try { + scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size( + queue, &(matrix_info->uplo_info), &(matrix_info->n_info), + &(matrix_info->nrhs_info), &(matrix_info->lda_info), + &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info)); + scratchpad = sycl::malloc_device(scratchpad_size, queue); + e = oneapi::mkl::lapack::potrs_batch( + queue, &(matrix_info->uplo_info), &(matrix_info->n_info), + &(matrix_info->nrhs_info), (Ty **)a, &(matrix_info->lda_info), (Ty **)b, + &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info), + scratchpad, scratchpad_size); + } catch (oneapi::mkl::lapack::batch_error const &be) { + std::cerr << "Unexpected exception caught during call to LAPACK API: " + "potrs_batch_scratchpad_size/potrs_batch" + << std::endl + << "reason: " << be.what() << std::endl + << "number: " << be.info() << std::endl; + int i = 0; + auto &ids = be.ids(); + std::vector info_vec(group_size); + for (auto const &e : be.exceptions()) { + try { + std::rethrow_exception(e); + } catch (oneapi::mkl::lapack::exception &e) { + std::cerr << "Exception " << ids[i] << std::endl + << "reason: " << e.what() << std::endl + << "info: " << e.info() << std::endl; + info_vec[i] = e.info(); + i++; + } + } + queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait(); + std::free(matrix_info); + if (scratchpad) + sycl::free(scratchpad, queue); + return 1; + } catch (sycl::exception const &e) { + std::cerr << "Caught synchronous SYCL exception:" << std::endl + << "reason: " << e.what() << std::endl; + queue.memset(info, 0, group_size * sizeof(int)).wait(); + std::free(matrix_info); + if (scratchpad) + sycl::free(scratchpad, queue); + return 1; + } + queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(e); + cgh.host_task([=] { + std::free(matrix_info); + sycl::free(scratchpad, queue); + }); + }); + queue.memset(info, 0, group_size * sizeof(int)); + return 0; +#endif +} + +namespace detail { +template