Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fma relu combination for convolution-output #31

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bench/convolution.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ struct nnp_profile benchmark_convolution(
bias,
output,
threadpool,
&computation_profile[iteration]);
&computation_profile[iteration],
false);
break;
case mode_input_gradient:
nnp_convolution_input_gradient(
Expand Down
2 changes: 1 addition & 1 deletion bench/vgg.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ double benchmark_vgg(
layers[layer_index].convolutional_layer.kernel,
layers[layer_index].convolutional_layer.bias,
layers[layer_index].output,
threadpool, NULL);
threadpool, NULL, false);
break;
case layer_type_fully_connected:
status = nnp_fully_connected_output(
Expand Down
14 changes: 13 additions & 1 deletion configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,14 +610,26 @@ def main():
convolution_output_alexnet_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet.cc")] + gtest_objects,
"convolution-output-alexnet-test")
convolution_output_alexnet_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/alexnet_with_relu.cc")] + gtest_objects,
"convolution-output-alexnet-with-relu-test")
convolution_output_vgg_a_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a.cc")] + gtest_objects,
"convolution-output-vgg-a-test")
convolution_output_vgg_a_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/vgg-a_with_relu.cc")] + gtest_objects,
"convolution-output-vgg-a-test-with-relu-test")
convolution_output_overfeat_fast_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast.cc")] + gtest_objects,
"convolution-output-overfeat-fast-test")
convolution_output_overfeat_fast_with_relu_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-output/overfeat-fast_with_relu.cc")] + gtest_objects,
"convolution-output-overfeat-fast-with-relu-test")
config.phony("convolution-output-test",
[convolution_output_smoke_test, convolution_output_alexnet_test, convolution_output_vgg_a_test, convolution_output_overfeat_fast_test])
[convolution_output_smoke_test, convolution_output_alexnet_test, \
convolution_output_alexnet_with_relu_test, convolution_output_vgg_a_test, \
convolution_output_vgg_a_with_relu_test, convolution_output_overfeat_fast_test, \
convolution_output_overfeat_fast_with_relu_test])

convolution_input_gradient_smoke_test = \
config.unittest(nnpack_objects + reference_layer_objects + [config.cxx("convolution-input-gradient/smoke.cc")] + gtest_objects,
Expand Down
3 changes: 2 additions & 1 deletion include/nnpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ enum nnp_status nnp_convolution_output(
const float bias[],
float output[],
pthreadpool_t threadpool,
struct nnp_profile* profile);
struct nnp_profile* profile,
bool relu);

/**
* @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
Expand Down
3 changes: 2 additions & 1 deletion include/nnpack/reference.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ void nnp_convolution_output__reference(
const float kernel_pointer[],
const float bias[],
float output_pointer[],
pthreadpool_t threadpool);
pthreadpool_t threadpool,
bool relu);

void nnp_convolution_input_gradient__reference(
size_t batch_size,
Expand Down
8 changes: 6 additions & 2 deletions src/convolution-output.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ static void compute_input_transform(
}

struct NNP_CACHE_ALIGN output_transform_context {
bool relu;
nnp_transform_2d_with_bias transform_function;
float* output;
const float* output_transform;
Expand Down Expand Up @@ -238,6 +239,7 @@ static void compute_matrix_multiplication(
}

static void compute_convolution_output(
bool relu,
bool fourier_transform,
size_t tuple_elements,
size_t batch_size,
Expand Down Expand Up @@ -378,6 +380,7 @@ static void compute_convolution_output(
.output_size = output_size,
.row_count = min(output_tile.height, output_size.height - y),
.column_count = min(output_tile.width, output_size.width - x),
.relu = relu,
};
pthreadpool_compute_2d_tiled(threadpool,
(pthreadpool_function_2d_tiled_t) compute_output_transform,
Expand All @@ -402,7 +405,8 @@ enum nnp_status nnp_convolution_output(
const float bias[],
float output[],
pthreadpool_t threadpool,
struct nnp_profile* profile)
struct nnp_profile* profile,
bool relu)
{
void* memory_block = NULL;
NNP_TOTAL_START(profile)
Expand Down Expand Up @@ -531,7 +535,7 @@ enum nnp_status nnp_convolution_output(
};

compute_convolution_output(
fourier_transform, tuple_elements,
relu, fourier_transform, tuple_elements,
batch_size, batch_block_max,batch_subblock_max,
input_channels, input_channels_block_max,
output_channels, output_channels_block_max, output_channels_subblock_max,
Expand Down
18 changes: 15 additions & 3 deletions src/ref/convolution-output.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@ struct convolution_output_context {
const float* kernel_pointer;
const float* bias;
float* output_pointer;
bool relu;
};

/* Reference (parametric) ReLU: positive inputs pass through unchanged,
 * negative inputs are scaled by negative_slope. A slope of 0.0f yields
 * the standard ReLU; note that data * 0.0f preserves the sign of zero
 * (negative inputs map to -0.0f), which keeps sign information intact. */
static inline float do_relu(float data, float negative_slope) {
	if (data > 0.0f) {
		return data;
	}
	return negative_slope * data;
}

static void compute_convolution_output(
const struct convolution_output_context context[restrict static 1],
size_t sample, size_t output_channel)
{
bool apply_relu = context->relu;
const size_t input_channels = context->input_channels;
const size_t output_channels = context->output_channels;
const struct nnp_size input_size = context->input_size;
Expand Down Expand Up @@ -50,7 +56,11 @@ static void compute_convolution_output(
}
}
}
output[sample][output_channel][y][x] = v + context->bias[output_channel];
if (apply_relu) {
output[sample][output_channel][y][x] = do_relu(v + context->bias[output_channel], 0.0f);
} else {
output[sample][output_channel][y][x] = v + context->bias[output_channel];
}
}
}
}
Expand All @@ -67,7 +77,8 @@ void nnp_convolution_output__reference(
const float kernel_pointer[],
const float bias[],
float output_pointer[],
pthreadpool_t threadpool)
pthreadpool_t threadpool,
bool relu)
{
const struct nnp_size output_size = {
.width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1,
Expand All @@ -84,7 +95,8 @@ void nnp_convolution_output__reference(
.input_pointer = input_pointer,
.kernel_pointer = kernel_pointer,
.bias = bias,
.output_pointer = output_pointer
.output_pointer = output_pointer,
.relu = relu
};

pthreadpool_compute_2d(threadpool,
Expand Down
6 changes: 3 additions & 3 deletions src/x86_64-fma/2d-fft-16x16.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@
arg_column_count = Argument(uint32_t, name="column_count")
arg_row_offset = Argument(uint32_t, name="row_offset")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
ifft16x16_arguments = (arg_f_pointer, arg_t_pointer, arg_bias, arg_f_stride, arg_t_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -292,8 +293,7 @@
CMP(reg_column_end, 8)
JB(store_columns_8_to_16.end)

fft16x16.inverse_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride, data_in=vfft_columns_8_to_16,
reg_row_start=reg_row_start, reg_row_end=reg_row_end, store_mask=store_mask_columns_8_to_16)

fft16x16.inverse_vfft(reg_t0_column_8, reg_t8_column_8, reg_t_stride, data_in=vfft_columns_8_to_16, \
reg_row_start=reg_row_start, reg_row_end=reg_row_end, store_mask=store_mask_columns_8_to_16, relu=arg_relu)

RETURN()
3 changes: 2 additions & 1 deletion src/x86_64-fma/2d-fft-8x8.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
arg_row_count = Argument(uint32_t, name="row_count")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_column_count = Argument(uint32_t, name="column_count")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
ifft8x8_arguments = (arg_f_pointer, arg_t_pointer, arg_bias, arg_f_stride, arg_t_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -134,6 +135,6 @@
fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse")
fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data)

block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start)
block8x8.store_packed(ymm_data, reg_t, reg_t_stride, reg_row_count, reg_column_end, reg_row_start, reg_column_start, arg_relu)

RETURN()
3 changes: 2 additions & 1 deletion src/x86_64-fma/2d-wt-8x8-3x3.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@
arg_column_count = Argument(uint32_t, name="column_count")
arg_row_offset = Argument(uint32_t, name="row_offset")
arg_column_offset = Argument(uint32_t, name="column_offset")
arg_relu = Argument(uint32_t, name="relu")
for with_bias in [False, True]:
if with_bias:
owt8x8_arguments = (arg_m_pointer, arg_s_pointer, arg_bias, arg_m_stride, arg_s_stride, arg_row_count, arg_column_count)
Expand Down Expand Up @@ -202,6 +203,6 @@

ymm_s = winograd.o6x6k3x3.output_transform(ymm_tt)

block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count)
block8x8.store_packed(ymm_s, reg_s, reg_s_stride, reg_row_count, reg_column_count, None, None, arg_relu)

RETURN()
10 changes: 8 additions & 2 deletions src/x86_64-fma/block8x8.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def load_with_padding(ymm_data, reg_data, reg_stride, reg_row_offset, reg_row_co
JZ(load_rows.end)


def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end, reg_row_offset=None, reg_column_start=None):
def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end, reg_row_offset=None, reg_column_start=None, relu=False):
assert isinstance(ymm_data, list) and all(isinstance(ymm_row, YMMRegister) for ymm_row in ymm_data)
assert isinstance(reg_data, GeneralPurposeRegister64)
assert isinstance(reg_stride, GeneralPurposeRegister64)
Expand Down Expand Up @@ -96,18 +96,24 @@ def store_packed(ymm_data, reg_data, reg_stride, reg_row_count, reg_column_end,
# stride is in elements; multiply by sizeof(float) to get stride in bytes
SHL(reg_stride, 2)

if relu:
ymm_zero = YMMRegister()
VMOVAPS(ymm_zero, Constant.uint32x8(0))

with Block() as store_rows:
for i, ymm_row in enumerate(ymm_data):
with Block() as store_row:
if reg_row_offset is not None:
CMP(reg_row_offset, i)
JA(store_row.end)

if relu:
VBLENDVPS(ymm_row, ymm_row, ymm_zero, ymm_row)

VMASKMOVPS([reg_data], ymm_store_mask, ymm_row)

if ymm_row is not ymm_data[-1]:
ADD(reg_data, reg_stride)

SUB(reg_row_count, 1)
JZ(store_rows.end)

11 changes: 10 additions & 1 deletion src/x86_64-fma/fft16x16.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def forward_vfft(reg_t0, reg_t8, reg_t_stride, data_out, reg_row_start=None, reg
store_ymm_result(out_imag[5], ymm_two_w5_imag)


def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_row_end=None, store_mask=None):
def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_row_end=None, store_mask=None, relu=False):
assert isinstance(reg_t0, GeneralPurposeRegister64)
assert isinstance(reg_t8, GeneralPurposeRegister64)
assert isinstance(reg_t_stride, GeneralPurposeRegister64)
Expand Down Expand Up @@ -487,6 +487,7 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
if store_mask:
VMOVAPS(ymm_store_mask, store_mask)


# FFT8: butterfly
with Block() as store_data:
for i, (data_lo, data_hi) in enumerate(zip(data[0:8], data[8:16])):
Expand All @@ -499,6 +500,10 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
negate_b=fft8_negate_b.get(id(data_hi), False),
writeback=False)

if relu:
ymm_zero = YMMRegister()
VMOVAPS(ymm_zero, Constant.uint32x8(0))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use negative signed zero, i.e. Constant.float32x8(-0.0)

Copy link
Contributor Author

@Jokeren Jokeren Oct 22, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please tell me the reason for using negative signed zero? Or propose a simple example?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for the backward pass. The backward pass needs to know which values were positive/negative before we applied ReLU. Using negative zero ensures that the sign of the convolution results doesn't change after we apply ReLU. See the discussion in #24 for why it's important.


with Block() as store_data_lo:
if reg_row_start:
CMP(reg_row_start, row_lo)
Expand All @@ -509,6 +514,8 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
elif reg_row_end:
CMP(reg_row_end, row_lo)
JBE(store_data.end)
if relu:
VBLENDVPS(ymm_data_lo, ymm_data_lo, ymm_zero, ymm_data_lo)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is preferable to use VMAXPS(ymm_data_lo, ymm_data_lo, ymm_zero) for performance reasons (VBLENDPS may generate multiple microoperations)

if store_mask:
VMASKMOVPS([reg_t0], ymm_store_mask, ymm_data_lo)
else:
Expand All @@ -523,6 +530,8 @@ def inverse_vfft(reg_t0, reg_t8, reg_t_stride, data_in, reg_row_start=None, reg_
if reg_row_end:
CMP(reg_row_end, row_hi)
JBE(store_data_hi.end)
if relu:
VBLENDVPS(ymm_data_hi, ymm_data_hi, ymm_zero, ymm_data_hi)
if store_mask:
VMASKMOVPS([reg_t8], ymm_store_mask, ymm_data_hi)
else:
Expand Down
Loading