Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backward of gamma and beta for layernorm and groupnorm #1013

Merged
merged 16 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions example/53_layernorm_bwd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_example_executable(example_layernorm2d_bwd_fp16 layernorm2d_bwd_fp16.cpp)
165 changes: 165 additions & 0 deletions example/53_layernorm_bwd/layernorm2d_bwd_fp16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"

#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm_bwd.hpp"

using DYDataType = ck::half_t;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using MeanInvStdDataType = float;
using DGammaDataType = ck::half_t;
using DBetaDataType = ck::half_t;
using DXDataType = ck::half_t;
using ComputeDataType = float;

constexpr int Rank = 2;
constexpr int NumReduceDim = 1;

// Layernorm:

// Input shape
// dy: [M, N]
// x: [M, N]
// mean: [M, 1]
// inv_std: [M, 1]

// Output shape
// dgamma: [1, N]
// dbeta: [1, N]

// dgamma = reduce_sum(dy * (x - mean) * inv_std, axis=0)
// dbeta = reduce_sum(dy, axis=0)

// [CAUSION]
// In DeviceNormalizationBwdGammaBetaImpl, M is invarient dimension, K is reduced dimension
// Hence, M in this example and DeviceNormalizationBwdGammaBetaImpl is different
using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
DYDataType,
XDataType,
MeanInvStdDataType,
ComputeDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterInvarient
32, // ClusterReduce
8, // SliceInvarient
1, // SliceReduce
false, // IsDYSrcVectorDimReduced
8, // DYSrcVectorSize
false, // IsXSrcVectorDimReduced
8, // XSrcVectorSize
true, // IsMeanInvStdSrcVectorDimReduced
1, // MeanInvStdSrcVectorSize
1, // DGammaDstVectorSize
1>; // DBetaDstVectorSize

int main()
{
bool time_kernel = false;

ck::index_t M = 1024;
ck::index_t N = 512;

Tensor<DYDataType> dy({M, N});
Tensor<XDataType> x({M, N});
Tensor<GammaDataType> gamma({N});
Tensor<MeanInvStdDataType> mean({M});
Tensor<MeanInvStdDataType> inv_std({M});

Tensor<DGammaDataType> dgamma({N});
Tensor<DBetaDataType> dbeta({N});
Tensor<DXDataType> dx({M, N});

dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0.0, 1.0});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});

DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());

dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());

auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
auto gamma_beta_argument_ptr =
gamma_beta_device_instance.MakeArgumentPointer({M, N}, // inLengths
{N, 1}, // dyStrides
{N, 1}, // xStrides
{1, 0}, // meanStrides
{1, 0}, // invStdStrides
{N}, // outLengths
{1}, // dgammaStrides
{1}, // dbetaStrides
{0}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dgamma_dev.GetDeviceBuffer(),
dbeta_dev.GetDeviceBuffer());

if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported" << std::endl;
return 1;
};

auto gamma_beta_invoker_ptr = gamma_beta_device_instance.MakeInvokerPointer();
gamma_beta_invoker_ptr->Run(gamma_beta_argument_ptr.get(), StreamConfig{nullptr, time_kernel});

bool pass = true;
{
Tensor<DGammaDataType> host_dgamma({N});
Tensor<DBetaDataType> host_dbeta({N});
Tensor<DXDataType> host_dx({M, N});
using ReferenceInstance =
ck::tensor_operation::host::ReferenceLayernormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;

ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, {M, N});
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);

dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());

pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
}

return (pass ? 0 : 1);
}
1 change: 1 addition & 0 deletions example/54_groupnorm_bwd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
add_example_executable(example_groupnorm_bwd_fp16 groupnorm_bwd_fp16.cpp)
167 changes: 167 additions & 0 deletions example/54_groupnorm_bwd/groupnorm_bwd_fp16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"

#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_normalization_bwd_gamma_beta_impl.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm_bwd.hpp"

using DYDataType = ck::half_t;
using XDataType = ck::half_t;
using GammaDataType = ck::half_t;
using MeanInvStdDataType = float;
using DGammaDataType = ck::half_t;
using DBetaDataType = ck::half_t;
using DXDataType = ck::half_t;
using ComputeDataType = float;

constexpr int Rank = 5;
constexpr int NumReduceDim = 3;

// Grouprnorm
// kernel: M , K
// dy: N, H, W, G, C -> G * C, N * H * W
// x: N, H, W, G, C -> G * C, N * H * W
// mean: N, 1, 1, G, 1 -> G * 1, N * 1 * 1
// rstd: N, 1, 1, G, 1 -> G * 1, N * 1 * 1

// dgamma: 1, 1, 1, G, C -> G * C
// dbeta: 1, 1, 1, G, C -> G * C

// reduced axis: 0, 1, 2

using GammaBetaDeviceInstance = ck::tensor_operation::device::DeviceNormalizationBwdGammaBetaImpl<
DYDataType,
XDataType,
MeanInvStdDataType,
ComputeDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim,
256, // BlockSize
8, // ClusterInvarient
32, // ClusterReduce
8, // SliceInvarient
1, // SliceReduce
false, // IsDYSrcVectorDimReduced
8, // DYSrcVectorSize
false, // IsXSrcVectorDimReduced
8, // XSrcVectorSize
false, // IsMeanInvStdSrcVectorDimReduced
1, // MeanInvStdSrcVectorSize
1, // DGammaDstVectorSize
1>; // DBetaDstVectorSize

int main()
{
bool time_kernel = false;

ck::index_t N = 16;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 32;
ck::index_t C = 64;

Tensor<DYDataType> dy({N, H, W, G, C});
Tensor<XDataType> x({N, H, W, G, C});
Tensor<GammaDataType> gamma({G, C});
Tensor<MeanInvStdDataType> mean({N, G});
Tensor<MeanInvStdDataType> inv_std({N, G});

Tensor<DGammaDataType> dgamma({G, C});
Tensor<DBetaDataType> dbeta({G, C});
Tensor<DXDataType> dx({N, H, W, G, C});

dy.GenerateTensorValue(GeneratorTensor_3<DYDataType>{0.0, 1.0});
x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0.0, 1.0});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{0.0, 1.0});
mean.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});
inv_std.GenerateTensorValue(GeneratorTensor_3<MeanInvStdDataType>{0.0, 1.0});

DeviceMem dy_dev(sizeof(DYDataType) * dy.mDesc.GetElementSpaceSize());
DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
DeviceMem mean_dev(sizeof(MeanInvStdDataType) * mean.mDesc.GetElementSpaceSize());
DeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * inv_std.mDesc.GetElementSpaceSize());
DeviceMem dgamma_dev(sizeof(DGammaDataType) * dgamma.mDesc.GetElementSpaceSize());
DeviceMem dbeta_dev(sizeof(DBetaDataType) * dbeta.mDesc.GetElementSpaceSize());

dy_dev.ToDevice(dy.mData.data());
x_dev.ToDevice(x.mData.data());
mean_dev.ToDevice(mean.mData.data());
inv_std_dev.ToDevice(inv_std.mData.data());

std::vector<ck::index_t> dyStrides{dy.mDesc.GetStrides().begin(), dy.mDesc.GetStrides().end()};
std::vector<ck::index_t> xStrides{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
std::vector<ck::index_t> meanStrides = {G, 0, 0, 1, 0};
std::vector<ck::index_t> invStdStrides = {G, 0, 0, 1, 0};

auto gamma_beta_device_instance = GammaBetaDeviceInstance{};
auto gamma_beta_argument_ptr =
gamma_beta_device_instance.MakeArgumentPointer({N, H, W, G, C}, // inLengths
dyStrides, // dyStrides
xStrides, // xStrides
meanStrides, // meanStrides
invStdStrides, // invStdStrides
{G, C}, // outLengths
{C, 1}, // dgammaStrides
{C, 1}, // dbetaStrides
{0, 1, 2}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dgamma_dev.GetDeviceBuffer(),
dbeta_dev.GetDeviceBuffer());

if(!gamma_beta_device_instance.IsSupportedArgument(gamma_beta_argument_ptr.get()))
{
std::cout << "The runtime parameters are not supported" << std::endl;
return 1;
};

auto gamma_beta_invoker_ptr = gamma_beta_device_instance.MakeInvokerPointer();
gamma_beta_invoker_ptr->Run(gamma_beta_argument_ptr.get(), StreamConfig{nullptr, time_kernel});

bool pass = true;
{
Tensor<DGammaDataType> host_dgamma({G, C});
Tensor<DBetaDataType> host_dbeta({G, C});
Tensor<DXDataType> host_dx({N, H, W, G, C});
using ReferenceInstance =
ck::tensor_operation::host::ReferenceGroupnormBwd<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
DXDataType,
ComputeDataType>;

ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(
dy, x, gamma, mean, inv_std, host_dgamma, host_dbeta, host_dx, {N, H, W, G, C});
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);

dgamma_dev.FromDevice(dgamma.mData.data());
dbeta_dev.FromDevice(dbeta.mData.data());

pass &= ck::utils::check_err(dgamma, host_dgamma, "Error: Incorrect dgamma", 1e-3, 1e-3);
pass &= ck::utils::check_err(dbeta, host_dbeta, "Error: Incorrect dbeta", 1e-3, 1e-3);
}

return (pass ? 0 : 1);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceNormalizationBwdGammaBeta : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> dyStrides,
const std::vector<index_t> xStrides,
const std::vector<index_t> meanStrides,
const std::vector<index_t> invStdStrides,
const std::vector<index_t> outLengths,
const std::vector<index_t> dgammaStrides,
const std::vector<index_t> dbetaStrides,
const std::vector<index_t> reduceDims,
const void* p_dy,
const void* p_x,
const void* p_mean,
const void* p_invStd,
void* p_dgamma,
void* p_dbeta) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <typename DYDataType,
typename XDataType,
typename MeanInvStdDataType,
typename DGammaDataType,
typename DBetaDataType,
index_t Rank,
index_t NumReduceDim>
using DeviceNormalizationBwdGammaBetaPtr =
std::unique_ptr<DeviceNormalizationBwdGammaBeta<DYDataType,
XDataType,
MeanInvStdDataType,
DGammaDataType,
DBetaDataType,
Rank,
NumReduceDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
Loading