Skip to content

Commit

Permalink
SYCL: improve Xe2 performance
Browse files (browse the repository at this point in the history)
  • Loading branch information
atafra committed Jan 8, 2025
1 parent a777c82 commit 9a1695d
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Version History

### Changes in v2.3.2:

- Improved performance for Intel Lunar Lake and Battlemage GPUs
- Added Intel Panther Lake GPU support
- Fixed compile error when building with OpenImageIO 3.x

Expand Down
23 changes: 18 additions & 5 deletions devices/sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,17 @@ set(OIDN_SYCL_SOURCES_COMMON
)

set(OIDN_SYCL_SOURCES_ARCH
sycl_conv_xehpc.cpp
sycl_conv_xe2.cpp
sycl_conv_xehpg.cpp
sycl_conv_xelp.cpp
)

if(UNIX)
list(APPEND OIDN_SYCL_SOURCES_ARCH
sycl_conv_xehpc.cpp
)
endif()

add_library(OpenImageDenoise_device_sycl SHARED
${OIDN_SYCL_SOURCES_COMMON}
${OIDN_SYCL_SOURCES_ARCH}
Expand All @@ -110,14 +116,18 @@ if(OIDN_DEVICE_SYCL_AOT)

set(OIDN_SYCL_AOT_TARGETS_XELP tgllp,rkl,adl-s,adl-p,adl-n,dg1,mtl-u,mtl-h)
set(OIDN_SYCL_AOT_TARGETS_XEHPG acm-g10,acm-g11,acm-g12,arl-h)
set(OIDN_SYCL_AOT_TARGETS_XEHPC lnl-m)
set(OIDN_SYCL_AOT_TARGETS_XE2 lnl-m)
if(UNIX)
set(OIDN_SYCL_AOT_TARGETS_XELP ${OIDN_SYCL_AOT_TARGETS_XELP},pvc-vg)
set(OIDN_SYCL_AOT_TARGETS_XEHPC ${OIDN_SYCL_AOT_TARGETS_XEHPC},pvc-sdv,pvc)
set(OIDN_SYCL_AOT_TARGETS_XEHPC pvc-sdv,pvc)
endif()

set(OIDN_SYCL_AOT_TARGETS
${OIDN_SYCL_AOT_TARGETS_XELP},${OIDN_SYCL_AOT_TARGETS_XEHPG},${OIDN_SYCL_AOT_TARGETS_XEHPC}
${OIDN_SYCL_AOT_TARGETS_XELP},${OIDN_SYCL_AOT_TARGETS_XEHPG},${OIDN_SYCL_AOT_TARGETS_XE2}
)
if(OIDN_SYCL_AOT_TARGETS_XEHPC)
set(OIDN_SYCL_AOT_TARGETS ${OIDN_SYCL_AOT_TARGETS},${OIDN_SYCL_AOT_TARGETS_XEHPC})
endif()

macro(oidn_set_sycl_aot_options sources options)
set(_final_options ${options})
Expand All @@ -131,7 +141,10 @@ if(OIDN_DEVICE_SYCL_AOT)
oidn_set_sycl_aot_options(sycl_conv_xelp.cpp "-device ${OIDN_SYCL_AOT_TARGETS_XELP}")
oidn_set_sycl_aot_options(sycl_conv_xehpg.cpp
"-device ${OIDN_SYCL_AOT_TARGETS_XEHPG} -options '-doubleGRF'")
oidn_set_sycl_aot_options(sycl_conv_xehpc.cpp "-device ${OIDN_SYCL_AOT_TARGETS_XEHPC}")
if(OIDN_SYCL_AOT_TARGETS_XEHPC)
oidn_set_sycl_aot_options(sycl_conv_xehpc.cpp "-device ${OIDN_SYCL_AOT_TARGETS_XEHPC}")
endif()
oidn_set_sycl_aot_options(sycl_conv_xe2.cpp "-device ${OIDN_SYCL_AOT_TARGETS_XE2}")
endif()

if(OIDN_DEVICE_SYCL_JIT_CACHE)
Expand Down
4 changes: 2 additions & 2 deletions devices/sycl/sycl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ OIDN_NAMESPACE_BEGIN
using namespace esimd;
using namespace esimdx;

#if defined(OIDN_ARCH_XEHPC)
#if defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XE2)
constexpr int maxLSCBlockByteSize = 512;
#elif defined(OIDN_ARCH_XEHPG)
constexpr int maxLSCBlockByteSize = 256;
Expand All @@ -31,7 +31,7 @@ OIDN_NAMESPACE_BEGIN
static_assert(byteSize % sizeof(DT) == 0, "unsupported block size");
};

#if defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XEHPG)
#if defined(OIDN_ARCH_XEHPG) || defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XE2)

template<typename T, int N>
oidn_inline simd<T, N> loadBlock(const T* ptr)
Expand Down
7 changes: 4 additions & 3 deletions devices/sycl/sycl_conv.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ OIDN_NAMESPACE_BEGIN
Ref<Conv> newSYCLConv(SYCLEngine* engine, const ConvDesc& desc);
}

#if defined(__linux__)
namespace xehpc {
Ref<Conv> newSYCLConv(SYCLEngine* engine, const ConvDesc& desc);
}
/*
namespace xehpc_fast {
#endif

namespace xe2 {
Ref<Conv> newSYCLConv(SYCLEngine* engine, const ConvDesc& desc);
}
*/

OIDN_NAMESPACE_END
27 changes: 9 additions & 18 deletions devices/sycl/sycl_conv_xe.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ OIDN_NAMESPACE_BEGIN
namespace xelp {
#elif defined(OIDN_ARCH_XEHPG)
namespace xehpg {
#elif defined(OIDN_ARCH_XEHPC_FAST)
namespace xehpc_fast {
#elif defined(OIDN_ARCH_XEHPC)
namespace xehpc {
#elif defined(OIDN_ARCH_XE2)
namespace xe2 {
#endif

template<typename SrcDstT, typename WeightT, TensorLayout srcDstLayout, TensorLayout weightLayout,
Expand All @@ -41,6 +41,9 @@ namespace xehpc {
#elif defined(OIDN_ARCH_XEHPC)
using MatmulT = SrcDstT;
static constexpr int blockOH = 4; // block output height
#elif defined(OIDN_ARCH_XE2)
using MatmulT = SrcDstT;
static constexpr int blockOH = 6; // block output height
#else
using MatmulT = float; // no DPAS -> use FP32 FMAs
static constexpr int blockOH = 2; // block output height
Expand All @@ -60,9 +63,6 @@ namespace xehpc {
#if defined(OIDN_ARCH_XEHPG)
// FP32 accumulator rows
simd<float, blockOW * blockAC> accumRows[blockOH][numBlockAC] = {}; // = 0
#elif defined(OIDN_ARCH_XEHPC_FAST)
// Output rows
simd<SrcDstT, blockOW * blockC> outRows[blockOH] = {}; // = 0
#else
// FP32 accumulator rows
simd<float, blockOW * blockC> accumRows[blockOH] = {}; // = 0
Expand Down Expand Up @@ -127,16 +127,7 @@ namespace xehpc {
inRows[(kh + boh) % blockOH].template select<blockOW * blockC, 1>(kw * blockC).read());
}
}
#elif defined(OIDN_ARCH_XEHPC_FAST)
#pragma unroll
for (int boh = 0; boh < blockOH; ++boh)
{
outRows[boh] = xmx::dpas<dpasDepth, dpasRepeat, SrcDstT>(
outRows[boh],
weightMat,
inRows[(kh + boh) % blockOH].template select<blockOW * blockC, 1>(kw * blockC).read());
}
#elif defined(OIDN_ARCH_XEHPC)
#elif defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XE2)
#pragma unroll
for (int boh = 0; boh < blockOH; ++boh)
{
Expand Down Expand Up @@ -178,7 +169,7 @@ namespace xehpc {
for (int i = 0; i < numBlockAC; ++i)
outRowView.template select<blockOW, 1, blockAC, 1>(0, i * blockAC) = accumRows[boh][i];
}
#elif !defined(OIDN_ARCH_XEHPC_FAST)
#else
// Down-convert accumulator rows to output rows
simd<SrcDstT, blockOW * blockC> outRows[blockOH];

Expand Down Expand Up @@ -385,7 +376,7 @@ namespace xehpc {
#if defined(OIDN_ARCH_XEHPG)
using WeightT = SrcDstT;
static constexpr TensorLayout weightLayout = TensorLayout::OIhw2o8i8o2i;
#elif defined(OIDN_ARCH_XEHPC)
#elif defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XE2)
using WeightT = SrcDstT;
static constexpr TensorLayout weightLayout = TensorLayout::OIhw8i16o2i;
#else
Expand Down Expand Up @@ -422,7 +413,7 @@ namespace xehpc {
#endif

// Compute the final work-group size
#if defined(OIDN_ARCH_XEHPC)
#if defined(OIDN_ARCH_XEHPC) || defined(OIDN_ARCH_XE2)
const int maxGroupSize = (engine->getArch() == SYCLArch::XeHPC) ? 32 : 8;
#else
const int maxGroupSize = 16;
Expand Down
6 changes: 6 additions & 0 deletions devices/sycl/sycl_conv_xe2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Copyright 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#define OIDN_ARCH_XE2

#include "sycl_conv_xe.h"
5 changes: 1 addition & 4 deletions devices/sycl/sycl_conv_xehpc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,4 @@

#define OIDN_ARCH_XEHPC

#include "sycl_conv_xe.h"

//#define OIDN_ARCH_XEHPC_FAST
//#include "sycl_conv_xe.h"
#include "sycl_conv_xe.h"
5 changes: 4 additions & 1 deletion devices/sycl/sycl_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,14 @@ OIDN_NAMESPACE_BEGIN
case SYCLArch::XeLPGplus:
case SYCLArch::XeHPG:
return xehpg::newSYCLConv(this, desc);
#if defined(__linux__)
case SYCLArch::XeHPC:
return xehpc::newSYCLConv(this, desc);
#endif
case SYCLArch::Xe2LPG:
case SYCLArch::Xe2HPG:
case SYCLArch::Xe3LPG:
return xehpc::newSYCLConv(this, desc);
return xe2::newSYCLConv(this, desc);
default:
throw std::logic_error("unsupported architecture");
}
Expand Down

0 comments on commit 9a1695d

Please sign in to comment.