fma relu combination for convolution-output #31

Closed · wants to merge 8 commits
Changes from 3 commits
1 change: 1 addition & 0 deletions bench/convolution.c
@@ -45,6 +45,7 @@ struct nnp_profile benchmark_convolution(
switch (mode) {
case mode_output:
nnp_convolution_output(
nnp_activation_identity,
algorithm,
batch_size,
input_channels,
4 changes: 3 additions & 1 deletion bench/vgg.c
@@ -94,7 +94,9 @@ double benchmark_vgg(
for (size_t layer_index = 0; layer_index < layers_count; layer_index++) {
switch (layers[layer_index].type) {
case layer_type_convolutional:
status = nnp_convolution_output(nnp_convolution_algorithm_auto,
status = nnp_convolution_output(
nnp_activation_identity,
Owner commented: Is the order correct? In include/nnpack.h, activation is the second argument.

nnp_convolution_algorithm_auto,
batch_size,
layers[layer_index].convolutional_layer.input_channels,
layers[layer_index].convolutional_layer.output_channels,
14 changes: 13 additions & 1 deletion configure.py
@@ -610,14 +610,26 @@ def main():
convolution_output_alexnet_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet.cc")] + gtest_objects,
"convolution-output-alexnet-test")
convolution_output_alexnet_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet_with_relu.cc")] + gtest_objects,
"convolution-output-alexnet-with-relu-test")
convolution_output_vgg_a_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a.cc")] + gtest_objects,
"convolution-output-vgg-a-test")
convolution_output_vgg_a_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a_with_relu.cc")] + gtest_objects,
"convolution-output-vgg-a-test-with-relu-test")
convolution_output_overfeat_fast_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast.cc")] + gtest_objects,
"convolution-output-overfeat-fast-test")
convolution_output_overfeat_fast_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast_with_relu.cc")] + gtest_objects,
"convolution-output-overfeat-fast-with-relu-test")
config.phony("convolution-output-test",
[convolution_output_smoke_test, convolution_output_alexnet_test, convolution_output_vgg_a_test, convolution_output_overfeat_fast_test])
[convolution_output_smoke_test, convolution_output_alexnet_test,
convolution_output_alexnet_with_relu_test, convolution_output_vgg_a_test,
convolution_output_vgg_a_with_relu_test, convolution_output_overfeat_fast_test,
convolution_output_overfeat_fast_with_relu_test])

convolution_input_gradient_smoke_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-input-gradient/smoke.cc")] + gtest_objects,
7 changes: 7 additions & 0 deletions include/nnpack.h
@@ -174,8 +174,15 @@ enum nnp_status nnp_deinitialize(void);
* @param[out] profile An optional pointer to profiling structure.
* If provided, the structure would record time spent in different phases of the computation.
*/

enum nnp_activation {
nnp_activation_identity = 0,
nnp_activation_relu = 1,
};

enum nnp_status nnp_convolution_output(
enum nnp_convolution_algorithm algorithm,
enum nnp_activation activation,
size_t batch_size,
size_t input_channels,
size_t output_channels,
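
For context, a hedged usage sketch of the new API. Only the algorithm/activation pair and their order are confirmed by this header; the remaining parameters and the threadpool/profile tail follow the nnp_convolution_output signature as inferred from the call sites in this diff, and all sizes are hypothetical.

#include <stdlib.h>
#include <nnpack.h>

int main(void) {
    if (nnp_initialize() != nnp_status_success)
        return EXIT_FAILURE;

    /* Hypothetical toy problem: one 32x32 image, 3 -> 16 channels, 3x3 kernel,
     * 1-pixel padding so the output stays 32x32. */
    const size_t batch_size = 1, input_channels = 3, output_channels = 16;
    const struct nnp_size input_size = { .width = 32, .height = 32 };
    const struct nnp_padding input_padding = { .top = 1, .right = 1, .bottom = 1, .left = 1 };
    const struct nnp_size kernel_size = { .width = 3, .height = 3 };

    float* input  = calloc(batch_size * input_channels * 32 * 32, sizeof(float));
    float* kernel = calloc(output_channels * input_channels * 3 * 3, sizeof(float));
    float* bias   = calloc(output_channels, sizeof(float));
    float* output = calloc(batch_size * output_channels * 32 * 32, sizeof(float));

    /* Argument order per this header: algorithm first, then the new activation.
     * nnp_activation_relu asks the library to fuse ReLU into the output transform. */
    enum nnp_status status = nnp_convolution_output(
        nnp_convolution_algorithm_auto,
        nnp_activation_relu,
        batch_size, input_channels, output_channels,
        input_size, input_padding, kernel_size,
        input, kernel, bias, output,
        NULL /* threadpool */, NULL /* profile */);

    free(input); free(kernel); free(bias); free(output);
    return status == nnp_status_success ? EXIT_SUCCESS : EXIT_FAILURE;
}
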
6 changes: 6 additions & 0 deletions include/nnpack/hwinfo.h
@@ -43,17 +43,23 @@ struct transforms {
nnp_transform_2d fft8x8_and_store;
nnp_transform_2d fft8x8_and_stream;
nnp_transform_2d ifft8x8;
nnp_transform_2d ifft8x8_with_relu;
nnp_transform_2d_with_bias ifft8x8_with_bias;
nnp_transform_2d_with_bias ifft8x8_with_bias_with_relu;
nnp_transform_2d fft16x16_and_store;
nnp_transform_2d fft16x16_and_stream;
nnp_transform_2d ifft16x16;
nnp_transform_2d ifft16x16_with_relu;
nnp_transform_2d_with_bias ifft16x16_with_bias;
nnp_transform_2d_with_bias ifft16x16_with_bias_with_relu;
nnp_transform_2d iwt_f6x6_3x3_and_store;
nnp_transform_2d iwt_f6x6_3x3_and_stream;
nnp_transform_2d kwt_f6x6_3x3;
nnp_transform_2d kwt_f6x6_3Rx3R;
nnp_transform_2d owt_f6x6_3x3;
nnp_transform_2d owt_f6x6_3x3_with_relu;
nnp_transform_2d_with_bias owt_f6x6_3x3_with_bias;
nnp_transform_2d_with_bias owt_f6x6_3x3_with_bias_with_relu;
};

struct blockmac {
6 changes: 6 additions & 0 deletions include/nnpack/transform.h
@@ -13,12 +13,16 @@ typedef void (*nnp_transform_2d_with_bias)(const float*, float*, const float*, s
void nnp_fft8x8_and_store__avx2(const float t[], float f[], size_t stride_t, size_t stride_f, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_fft8x8_and_stream__avx2(const float t[], float f[], size_t stride_t, size_t stride_f, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft8x8__avx2(const float f[], float t[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft8x8_with_relu__avx2(const float f[], float t[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft8x8_with_bias__avx2(const float f[], float t[], const float bias[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count);
void nnp_ifft8x8_with_bias_with_relu__avx2(const float f[], float t[], const float bias[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count);

void nnp_fft16x16_and_store__avx2(const float t[], float f[], size_t stride_t, size_t stride_f, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_fft16x16_and_stream__avx2(const float t[], float f[], size_t stride_t, size_t stride_f, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft16x16__avx2(const float f[], float t[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft16x16_with_relu__avx2(const float f[], float t[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_ifft16x16_with_bias__avx2(const float f[], float t[], const float bias[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count);
void nnp_ifft16x16_with_bias_with_relu__avx2(const float f[], float t[], const float bias[], size_t stride_f, size_t stride_t, uint32_t row_count, uint32_t column_count);

void nnp_iwt8x8_3x3_and_store__avx2(const float d[], float wd[], size_t stride_d, size_t stride_wd, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_iwt8x8_3x3_and_stream__avx2(const float d[], float wd[], size_t stride_d, size_t stride_wd, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
@@ -27,7 +31,9 @@ void nnp_kwt8x8_3x3_and_stream__avx2(const float g[], float wg[], size_t stride_
void nnp_kwt8x8_3Rx3R_and_store__avx2(const float g[], float wg[], size_t stride_g, size_t stride_wg, uint32_t, uint32_t, uint32_t, uint32_t);
void nnp_kwt8x8_3Rx3R_and_stream__avx2(const float g[], float wg[], size_t stride_g, size_t stride_wg, uint32_t, uint32_t, uint32_t, uint32_t);
void nnp_owt8x8_3x3__avx2(const float m[], float s[], size_t stride_m, size_t stride_s, uint32_t row_count, uint32_t column_count, uint32_t, uint32_t);
void nnp_owt8x8_3x3_with_relu__avx2(const float m[], float s[], size_t stride_m, size_t stride_s, uint32_t row_count, uint32_t column_count, uint32_t, uint32_t);
void nnp_owt8x8_3x3_with_bias__avx2(const float m[], float s[], const float bias[], size_t stride_m, size_t stride_s, uint32_t row_count, uint32_t column_count);
void nnp_owt8x8_3x3_with_bias_with_relu__avx2(const float m[], float s[], const float bias[], size_t stride_m, size_t stride_s, uint32_t row_count, uint32_t column_count);

void nnp_fft8x8__psimd(const float t[], float f[], size_t stride_t, size_t stride_f, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
void nnp_fft8x8_and_macc__psimd(const float t[], float f[], const float x[], size_t stride_t, uint32_t row_count, uint32_t column_count, uint32_t row_offset, uint32_t column_offset);
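
To clarify what the new *_with_bias_with_relu entry points compute, here is a scalar reference sketch of the fused epilogue. The helper below is hypothetical and for illustration only, not one of the AVX2 kernels declared above; the semantics (add the channel bias after the inverse transform, then clamp at zero) are assumed from the transform names and the PR title.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the bias + ReLU epilogue that the *_with_bias_with_relu
 * transforms are assumed to fuse into the inverse FFT/Winograd transform. */
static void bias_relu_epilogue(
    float t[], size_t stride_t, float bias,
    uint32_t row_count, uint32_t column_count)
{
    for (uint32_t row = 0; row < row_count; row++) {
        for (uint32_t column = 0; column < column_count; column++) {
            const float value = t[row * stride_t + column] + bias;
            /* ReLU: max(value, 0) */
            t[row * stride_t + column] = value > 0.0f ? value : 0.0f;
        }
    }
}
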
33 changes: 31 additions & 2 deletions src/convolution-output.c
@@ -391,6 +391,7 @@ static void compute_convolution_output(

enum nnp_status nnp_convolution_output(
enum nnp_convolution_algorithm algorithm,
enum nnp_activation activation,
size_t batch_size,
size_t input_channels,
size_t output_channels,
@@ -454,14 +455,32 @@ enum nnp_status nnp_convolution_output(
case nnp_convolution_algorithm_ft8x8:
input_transform_function = nnp_hwinfo.transforms.fft8x8_and_stream;
kernel_transform_function = nnp_hwinfo.transforms.fft8x8_and_stream;
output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias;
switch (activation) {
case nnp_activation_relu:
output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu;
break;
case nnp_activation_identity:
output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias;
break;
default:
goto cleanup;
Owner commented: nnp_convolution_output should return an error code if activation has an unknown value. I suggest checking activation inside validate_convolution_arguments; then, in these switch statements, you can write NNP_UNREACHABLE; to indicate that this case never happens. The compiler will use it for optimization.

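A minimal sketch of that suggestion (hypothetical code, not part of this diff; nnp_status_invalid_activation is an assumed new error code, and NNP_UNREACHABLE is the macro the reviewer names):

static enum nnp_status validate_convolution_arguments(enum nnp_activation activation) {
    /* Existing size/pointer checks elided; only the new activation check is shown. */
    if (activation != nnp_activation_identity && activation != nnp_activation_relu)
        return nnp_status_invalid_activation; /* assumed new status code */
    return nnp_status_success;
}

/* Later, inside the per-algorithm dispatch, other values can be treated as impossible: */
switch (activation) {
    case nnp_activation_relu:
        output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu;
        break;
    case nnp_activation_identity:
        output_transform_function = nnp_hwinfo.transforms.ifft8x8_with_bias;
        break;
    default:
        NNP_UNREACHABLE; /* validated above, so the compiler may assume this never runs */
}
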
}
transform_tile = (struct nnp_size) { .height = 8, .width = 8 };
fourier_transform = true;
break;
case nnp_convolution_algorithm_ft16x16:
input_transform_function = nnp_hwinfo.transforms.fft16x16_and_stream;
kernel_transform_function = nnp_hwinfo.transforms.fft16x16_and_stream;
output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias;
switch (activation) {
case nnp_activation_relu:
output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu;
break;
case nnp_activation_identity:
output_transform_function = nnp_hwinfo.transforms.ifft16x16_with_bias;
break;
default:
goto cleanup;
}
transform_tile = (struct nnp_size) { .height = 16, .width = 16 };
fourier_transform = true;
break;
@@ -473,6 +492,16 @@ enum nnp_status nnp_convolution_output(
input_transform_function = nnp_hwinfo.transforms.iwt_f6x6_3x3_and_stream;
kernel_transform_function = nnp_hwinfo.transforms.kwt_f6x6_3x3;
output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias;
switch (activation) {
case nnp_activation_relu:
output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu;
break;
case nnp_activation_identity:
output_transform_function = nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias;
break;
default:
goto cleanup;
}
transform_tile = (struct nnp_size) { .height = 8, .width = 8 };
fourier_transform = false;
break;
6 changes: 6 additions & 0 deletions src/init.c
@@ -303,17 +303,23 @@ static void init_hwinfo(void) {
nnp_hwinfo.transforms.fft8x8_and_store = nnp_fft8x8_and_store__avx2;
nnp_hwinfo.transforms.fft8x8_and_stream = nnp_fft8x8_and_stream__avx2;
nnp_hwinfo.transforms.ifft8x8 = nnp_ifft8x8__avx2;
nnp_hwinfo.transforms.ifft8x8_with_relu = nnp_ifft8x8_with_relu__avx2;
nnp_hwinfo.transforms.ifft8x8_with_bias = nnp_ifft8x8_with_bias__avx2;
nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = nnp_ifft8x8_with_bias_with_relu__avx2;
nnp_hwinfo.transforms.fft16x16_and_store = nnp_fft16x16_and_store__avx2;
nnp_hwinfo.transforms.fft16x16_and_stream = nnp_fft16x16_and_stream__avx2;
nnp_hwinfo.transforms.ifft16x16 = nnp_ifft16x16__avx2;
nnp_hwinfo.transforms.ifft16x16_with_relu = nnp_ifft16x16_with_relu__avx2;
nnp_hwinfo.transforms.ifft16x16_with_bias = nnp_ifft16x16_with_bias__avx2;
nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = nnp_ifft16x16_with_bias_with_relu__avx2;
nnp_hwinfo.transforms.iwt_f6x6_3x3_and_store = nnp_iwt8x8_3x3_and_store__avx2;
nnp_hwinfo.transforms.iwt_f6x6_3x3_and_stream = nnp_iwt8x8_3x3_and_stream__avx2;
nnp_hwinfo.transforms.kwt_f6x6_3x3 = nnp_kwt8x8_3x3_and_stream__avx2;
nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = nnp_kwt8x8_3Rx3R_and_stream__avx2;
nnp_hwinfo.transforms.owt_f6x6_3x3 = nnp_owt8x8_3x3__avx2;
nnp_hwinfo.transforms.owt_f6x6_3x3_with_relu = nnp_owt8x8_3x3_with_relu__avx2;
nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = nnp_owt8x8_3x3_with_bias__avx2;
nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = nnp_owt8x8_3x3_with_bias_with_relu__avx2;
nnp_hwinfo.blockmac.fourier8x8_mac_with_conj = nnp_ft8x8gemmc__fma3;
nnp_hwinfo.blockmac.fourier16x16_mac_with_conj = nnp_ft16x16gemmc__fma3;
nnp_hwinfo.blockmac.winograd8x8_mac = nnp_s8x8gemm__fma3;