Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fma relu combination for convolution-output #31

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bench/convolution.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ struct nnp_profile benchmark_convolution(
bias,
output,
threadpool,
&computation_profile[iteration]);
&computation_profile[iteration],
false);
break;
case mode_input_gradient:
nnp_convolution_input_gradient(
Expand Down
2 changes: 1 addition & 1 deletion bench/vgg.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ double benchmark_vgg(
layers[layer_index].convolutional_layer.kernel,
layers[layer_index].convolutional_layer.bias,
layers[layer_index].output,
threadpool, NULL);
threadpool, NULL, false);
break;
case layer_type_fully_connected:
status = nnp_fully_connected_output(
Expand Down
14 changes: 13 additions & 1 deletion configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,14 +610,26 @@ def main():
convolution_output_alexnet_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet.cc")] + gtest_objects,
"convolution-output-alexnet-test")
convolution_output_alexnet_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet_with_relu.cc")] + gtest_objects,
"convolution-output-alexnet-with-relu-test")
convolution_output_vgg_a_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a.cc")] + gtest_objects,
"convolution-output-vgg-a-test")
convolution_output_vgg_a_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a_with_relu.cc")] + gtest_objects,
"convolution-output-vgg-a-test-with-relu-test")
convolution_output_overfeat_fast_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast.cc")] + gtest_objects,
"convolution-output-overfeat-fast-test")
convolution_output_overfeat_fast_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast_with_relu.cc")] + gtest_objects,
"convolution-output-overfeat-fast-with-relu-test")
config.phony("convolution-output-test",
[convolution_output_smoke_test, convolution_output_alexnet_test, convolution_output_vgg_a_test, convolution_output_overfeat_fast_test])
[convolution_output_smoke_test, convolution_output_alexnet_test, \
convolution_output_alexnet_with_relu_test, convolution_output_vgg_a_test, \
convolution_output_vgg_a_with_relu_test, convolution_output_overfeat_fast_test, \
convolution_output_overfeat_fast_with_relu_test])

convolution_input_gradient_smoke_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-input-gradient/smoke.cc")] + gtest_objects,
Expand Down
3 changes: 2 additions & 1 deletion include/nnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ enum nnp_status nnp_convolution_output(
const float bias[],
float output[],
pthreadpool_t threadpool,
struct nnp_profile* profile);
struct nnp_profile* profile,
bool relu);

/**
* @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
Expand Down
3 changes: 2 additions & 1 deletion include/nnpack/reference.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ void nnp_convolution_output__reference(
const float kernel_pointer[],
const float bias[],
float output_pointer[],
pthreadpool_t threadpool);
pthreadpool_t threadpool,
bool relu);

void nnp_convolution_input_gradient__reference(
size_t batch_size,
Expand Down
8 changes: 6 additions & 2 deletions src/convolution-output.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ static void compute_input_transform(
}

struct NNP_CACHE_ALIGN output_transform_context {
bool relu;
nnp_transform_2d_with_bias transform_function;
float* output;
const float* output_transform;
Expand Down Expand Up @@ -238,6 +239,7 @@ static void compute_matrix_multiplication(
}

static void compute_convolution_output(
bool relu,
bool fourier_transform,
size_t tuple_elements,
size_t batch_size,
Expand Down Expand Up @@ -378,6 +380,7 @@ static void compute_convolution_output(
.output_size = output_size,
.row_count = min(output_tile.height, output_size.height - y),
.column_count = min(output_tile.width, output_size.width - x),
.relu = relu,
};
pthreadpool_compute_2d_tiled(threadpool,
(pthreadpool_function_2d_tiled_t) compute_output_transform,
Expand All @@ -402,7 +405,8 @@ enum nnp_status nnp_convolution_output(
const float bias[],
float output[],
pthreadpool_t threadpool,
struct nnp_profile* profile)
struct nnp_profile* profile,
bool relu)
{
void* memory_block = NULL;
NNP_TOTAL_START(profile)
Expand Down Expand Up @@ -531,7 +535,7 @@ enum nnp_status nnp_convolution_output(
};

compute_convolution_output(
fourier_transform, tuple_elements,
relu, fourier_transform, tuple_elements,
batch_size, batch_block_max,batch_subblock_max,
input_channels, input_channels_block_max,
output_channels, output_channels_block_max, output_channels_subblock_max,
Expand Down
18 changes: 15 additions & 3 deletions src/ref/convolution-output.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@ struct convolution_output_context {
const float* kernel_pointer;
const float* bias;
float* output_pointer;
bool relu;
};

/* Reference (parametric) ReLU: positive inputs pass through unchanged,
 * negative inputs are scaled by negative_slope. A slope of 0.0f yields
 * the standard ReLU; note that data * 0.0f preserves the sign of zero
 * (negative inputs map to -0.0f), which keeps sign information intact. */
static inline float do_relu(float data, float negative_slope) {
	if (data > 0.0f) {
		return data;
	}
	return negative_slope * data;
}

static void compute_convolution_output(
const struct convolution_output_context context[restrict static 1],
size_t sample, size_t output_channel)
{
bool apply_relu = context->relu;
const size_t input_channels = context->input_channels;
const size_t output_channels = context->output_channels;
const struct nnp_size input_size = context->input_size;
Expand Down Expand Up @@ -50,7 +56,11 @@ static void compute_convolution_output(
}
}
}
output[sample][output_channel][y][x] = v + context->bias[output_channel];
if (apply_relu) {
output[sample][output_channel][y][x] = do_relu(v + context->bias[output_channel], 0.0f);
} else {
output[sample][output_channel][y][x] = v + context->bias[output_channel];
}
}
}
}
Expand All @@ -67,7 +77,8 @@ void nnp_convolution_output__reference(
const float kernel_pointer[],
const float bias[],
float output_pointer[],
pthreadpool_t threadpool)
pthreadpool_t threadpool,
bool relu)
{
const struct nnp_size output_size = {
.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1,
Expand All @@ -84,7 +95,8 @@ void nnp_convolution_output__reference(
.input_pointer = input_pointer,
.kernel_pointer = kernel_pointer,
.bias = bias,
.output_pointer = output_pointer
.output_pointer = output_pointer,
.relu = relu
};

pthreadpool_compute_2d(threadpool,
Expand Down
6 changes: 3 additions & 3 deletions src/x86_64-fma/2d-fft-16x16.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
arg_column_count = Argument(uint32_t, name="column_count")
arg_row_offset = Argument(uint32_t, name="row_offset")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
ifft16x16_arguments = (arg_f_pointer, arg_t_pointer, arg_bias, arg_f_stride, arg_t_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -292,8 +293,7 @@
CMP(reg_column_end, 8)
JB(store_columns_8_to_16.end)

fft16x16.inverse_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride, data_in=vfft_columns_8_to_16,
reg_row_start=reg_row_start, reg_row_end=reg_row_end, store_mask=store_mask_columns_8_to_16)

fft16x16.inverse_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride, data_in=vfft_columns_8_to_16, \
reg_row_start=reg_row_start, reg_row_end=reg_row_end, store_mask=store_mask_columns_8_to_16, relu=arg_relu)

RETURN()
3 changes: 2 additions & 1 deletion src/x86_64-fma/2d-fft-8x8.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
arg_row_count = Argument(uint32_t, name="row_count")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_column_count = Argument(uint32_t, name="column_count")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
ifft8x8_arguments = (arg_f_pointer, arg_t_pointer, arg_bias, arg_f_stride, arg_t_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -134,6 +135,6 @@
fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse")
fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data)

block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start)
block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start, arg_relu)

RETURN()
3 changes: 2 additions & 1 deletion src/x86_64-fma/2d-wt-8x8-3x3.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@
arg_column_count = Argument(uint32_t, name="column_count")
arg_row_offset = Argument(uint32_t, name="row_offset")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
owt8x8_arguments = (arg_m_pointer, arg_s_pointer, arg_bias, arg_m_stride, arg_s_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -202,6 +203,6 @@

ymm_s = winograd.o6x6k3x3.output_transform(ymm_tt)

block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count)
block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count, None, None, arg_relu)

RETURN()
10 changes: 8 additions & 2 deletions src/x86_64-fma/block8x8.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def load_with_padding(ymm_data, reg_data, reg_stride, reg_row_offset, reg_row_co
JZ(load_rows.end)


def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end, reg_row_offset=None, reg_column_start=None):
def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end, reg_row_offset=None, reg_column_start=None, relu=False):
assert isinstance(ymm_data, list) and all(isinstance(ymm_row, YMMRegister) for ymm_row in ymm_data)
assert isinstance(reg_data, GeneralPurposeRegister64)
assert isinstance(reg_stride, GeneralPurposeRegister64)
Expand Down Expand Up @@ -96,18 +96,24 @@ def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end,
# stride is in elements; multiply by sizeof(float) to get stride in bytes
SHL(reg_stride, 2)

if relu:
ymm_zero = YMMRegister()
VMOVAPS(ymm_zero, Constant.uint32x8(0))

with Block() as store_rows:
for i, ymm_row in enumerate(ymm_data):
with Block() as store_row:
if reg_row_offset is not None:
CMP(reg_row_offset, i)
JA(store_row.end)

if relu:
VBLENDVPS(ymm_row, ymm_row, ymm_zero, ymm_row)

VMASKMOVPS([reg_data], ymm_store_mask, ymm_row)

if ymm_row is not ymm_data[-1]:
ADD(reg_data, reg_stride)

SUB(reg_row_count, 1)
JZ(store_rows.end)

11 changes: 10 additions & 1 deletion src/x86_64-fma/fft16x16.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def forward_vfft(reg_t0, reg_t8, reg_t_stride, data_out, reg_row_start=None, reg
store_ymm_result(out_imag[5], ymm_two_w5_imag)


def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_row_end=None, store_mask=None):
def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_row_end=None, store_mask=None, relu=False):
assert isinstance(reg_t0, GeneralPurposeRegister64)
assert isinstance(reg_t8, GeneralPurposeRegister64)
assert isinstance(reg_t_stride, GeneralPurposeRegister64)
Expand Down Expand Up @@ -487,6 +487,7 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
if store_mask:
VMOVAPS(ymm_store_mask, store_mask)


# FFT8: butterfly
with Block() as store_data:
for i, (data_lo, data_hi) in enumerate(zip(data[0:8], data[8:16])):
Expand All @@ -499,6 +500,10 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
negate_b=fft8_negate_b.get(id(data_hi), False),
writeback=False)

if relu:
ymm_zero = YMMRegister()
VMOVAPS(ymm_zero, Constant.uint32x8(0))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use negative signed zero, i.e. Constant.float32x8(-0.0)

Copy link
Contributor Author

@Jokeren Jokeren Oct 22, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please tell me the reason for using negative signed zero? Or propose a simple example?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for the backward pass. The backward pass needs to know which values were positive/negative before we applied ReLU. Using negative zero ensures that the sign of the convolution results doesn't change after we apply ReLU. See the discussion in #24 for why it's important.


with Block() as store_data_lo:
if reg_row_start:
CMP(reg_row_start, row_lo)
Expand All @@ -509,6 +514,8 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
elif reg_row_end:
CMP(reg_row_end, row_lo)
JBE(store_data.end)
if relu:
VBLENDVPS(ymm_data_lo, ymm_data_lo, ymm_zero, ymm_data_lo)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is preferable to use VMAXPS(ymm_data_lo, ymm_data_lo, ymm_zero) for performance reasons (VBLENDPS may generate multiple microoperations)

if store_mask:
VMASKMOVPS([reg_t0], ymm_store_mask, ymm_data_lo)
else:
Expand All @@ -523,6 +530,8 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
if reg_row_end:
CMP(reg_row_end, row_hi)
JBE(store_data_hi.end)
if relu:
VBLENDVPS(ymm_data_hi, ymm_data_hi, ymm_zero, ymm_data_hi)
if store_mask:
VMASKMOVPS([reg_t8], ymm_store_mask, ymm_data_hi)
else:
Expand Down
Loading