Skip to content

Commit

Permalink
Training Stability Improvements
Browse files (browse the repository at this point in the history)
  • Loading branch information
Piotr Sowa authored and committed on Sep 2, 2018
1 parent ee87e93 commit 6e1c307
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 38 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ else()
endif()
list (APPEND DARKNET_CFLAGS "-Wfatal-errors")
list (APPEND DARKNET_CFLAGS "-Wno-unused-variable")
list (APPEND DARKNET_CFLAGS "-fPIC")
list (APPEND DARKNET_CFLAGS "-mfpmath=sse")
list (APPEND DARKNET_CFLAGS "-O2")
list (APPEND DARKNET_CFLAGS "-O0")
endif()

# Find OpenCV
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ ifeq ($(DEBUG), 1)
OPTS=-O0 -g
else
ifeq ($(RPI), 1)
OPTS=-O2
OPTS=-O0
else
OPTS=-O2 -mfpmath=sse
OPTS=-O0
endif
endif

Expand Down
2 changes: 1 addition & 1 deletion src/blas_kernels.cl
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ __kernel void normalize_kernel(int N, __global float *x, __global float *mean, _
int i = (id % spatial);

int index = b*filters*spatial + f*spatial + i;
x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .000001f));
x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .00001f));
}


Expand Down
96 changes: 64 additions & 32 deletions src/opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -434,16 +434,20 @@ void opencl_load_buffer(const char *buffer, const size_t size, cl_program *outpu
printf("opencl_load: could not create program. error: %s\n", clCheckError(clErr));
return;
}

#ifdef RPI
clErr = clBuildProgram(
*output,
1,
&opencl_devices[opencl_device_id_t],
"-cl-denorms-are-zero "
"-cl-std=CL1.2 "
,
NULL, NULL);

NULL, NULL, NULL);
#else
clErr = clBuildProgram(
*output,
1,
&opencl_devices[opencl_device_id_t],
"-cl-fp32-correctly-rounded-divide-sqrt "
"-Werror ", NULL, NULL);
#endif
if (clErr != CL_SUCCESS)
{
printf("opencl_load: could not compile. error: %s\n", clCheckError(clErr));
Expand Down Expand Up @@ -784,7 +788,7 @@ cl_mem_ext opencl_make_array(float *x, size_t n)
buf.ptr = x;

cl_int clErr;
buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
buf.len * buf.obs, buf.ptr, &clErr);
if (clErr != CL_SUCCESS)
printf("could not push array to device. error: %s\n", clCheckError(clErr));
Expand Down Expand Up @@ -818,7 +822,7 @@ cl_mem_ext opencl_make_int_array(int *x, size_t n)
buf.ptr = x;

cl_int clErr;
buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
buf.org = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
buf.len * buf.obs, buf.ptr, &clErr);
if (clErr != CL_SUCCESS)
printf("could not push array to device. error: %s\n", clCheckError(clErr));
Expand Down Expand Up @@ -849,16 +853,24 @@ void opencl_push_int_array(cl_mem_ext x_gpu, int *x, size_t n)
clock_t t;
t = clock();
#endif
cl_int clErr = clEnqueueWriteBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
(n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
//cl_int clErr = clEnqueueWriteBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
// (n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
cl_int clErr;
void* map = clEnqueueMapBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, CL_MAP_WRITE,
0, x_gpu.len * x_gpu.obs, 0, NULL, NULL, &clErr);
if (clErr != CL_SUCCESS)
printf("could not map array to device. error: %s\n", clCheckError(clErr));
memcpy(map, x, (n - x_gpu.off) * x_gpu.obs);
clErr = clEnqueueUnmapMemObject(opencl_queues[opencl_device_id_t], x_gpu.mem, map, 0, NULL, NULL);
if (clErr != CL_SUCCESS)
printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
#ifdef BENCHMARK
t = clock() - t;
double time_taken = ((double)t);
printf("%s\t%x\t%d\n", "opencl_push_int_array", x_gpu.ptr, (int)time_taken);
#endif

if (clErr != CL_SUCCESS)
printf("could not push array to device. error: %s\n", clCheckError(clErr));
//if (clErr != CL_SUCCESS)
// printf("could not push array to device. error: %s\n", clCheckError(clErr));
}

void opencl_pull_int_array(cl_mem_ext x_gpu, int *x, size_t n)
Expand All @@ -873,16 +885,24 @@ void opencl_pull_int_array(cl_mem_ext x_gpu, int *x, size_t n)
clock_t t;
t = clock();
#endif
cl_int clErr = clEnqueueReadBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
(n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
//cl_int clErr = clEnqueueReadBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
// (n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
cl_int clErr;
void* map = clEnqueueMapBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, CL_MAP_READ,
0, x_gpu.len * x_gpu.obs, 0, NULL, NULL, &clErr);
if (clErr != CL_SUCCESS)
printf("could not map array to device. error: %s\n", clCheckError(clErr));
memcpy(x, map, (n - x_gpu.off) * x_gpu.obs);
clErr = clEnqueueUnmapMemObject(opencl_queues[opencl_device_id_t], x_gpu.mem, map, 0, NULL, NULL);
if (clErr != CL_SUCCESS)
printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
#ifdef BENCHMARK
t = clock() - t;
double time_taken = ((double)t);
printf("%s\t%x\t%d\n", "opencl_pull_int_array", x_gpu.ptr, (int)time_taken);
#endif

if (clErr != CL_SUCCESS)
printf("could not pull array to device. error: %s\n", clCheckError(clErr));
//if (clErr != CL_SUCCESS)
// printf("could not pull array to device. error: %s\n", clCheckError(clErr));
}

void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n)
Expand All @@ -897,16 +917,24 @@ void opencl_push_array(cl_mem_ext x_gpu, float *x, size_t n)
clock_t t;
t = clock();
#endif
cl_int clErr = clEnqueueWriteBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
(n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
//cl_int clErr = clEnqueueWriteBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
// (n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
cl_int clErr;
void* map = clEnqueueMapBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, CL_MAP_WRITE,
0, x_gpu.len * x_gpu.obs, 0, NULL, NULL, &clErr);
if (clErr != CL_SUCCESS)
printf("could not map array to device. error: %s\n", clCheckError(clErr));
memcpy(map, x, (n - x_gpu.off) * x_gpu.obs);
clErr = clEnqueueUnmapMemObject(opencl_queues[opencl_device_id_t], x_gpu.mem, map, 0, NULL, NULL);
if (clErr != CL_SUCCESS)
printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
#ifdef BENCHMARK
t = clock() - t;
double time_taken = ((double)t);
printf("%s\t%x\t%d\n", "opencl_push_array", x_gpu.ptr, (int)time_taken);
#endif

if (clErr != CL_SUCCESS)
printf("could not push array to device. error: %s\n", clCheckError(clErr));
//if (clErr != CL_SUCCESS)
// printf("could not push array to device. error: %s\n", clCheckError(clErr));
}

void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n)
Expand All @@ -921,16 +949,24 @@ void opencl_pull_array(cl_mem_ext x_gpu, float *x, size_t n)
clock_t t;
t = clock();
#endif
cl_int clErr = clEnqueueReadBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
(n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
//cl_int clErr = clEnqueueReadBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, 0,
// (n - x_gpu.off) * x_gpu.obs, x_gpu.ptr, 0, NULL, NULL);
cl_int clErr;
void* map = clEnqueueMapBuffer(opencl_queues[opencl_device_id_t], x_gpu.mem, CL_TRUE, CL_MAP_READ,
0, x_gpu.len * x_gpu.obs, 0, NULL, NULL, &clErr);
if (clErr != CL_SUCCESS)
printf("could not map array to device. error: %s\n", clCheckError(clErr));
memcpy(x, map, (n - x_gpu.off) * x_gpu.obs);
clErr = clEnqueueUnmapMemObject(opencl_queues[opencl_device_id_t], x_gpu.mem, map, 0, NULL, NULL);
if (clErr != CL_SUCCESS)
printf("could not unmap array from device. error: %s\n", clCheckError(clErr));
#ifdef BENCHMARK
t = clock() - t;
double time_taken = ((double)t);
printf("%s\t%x\t%d\n", "opencl_pull_array", x_gpu.ptr, (int)time_taken);
#endif

if (clErr != CL_SUCCESS)
printf("could not pull array to device. error: %s\n", clCheckError(clErr));
//if (clErr != CL_SUCCESS)
// printf("could not pull array to device. error: %s\n", clCheckError(clErr));
}

void opencl_free(cl_mem_ext x_gpu)
Expand Down Expand Up @@ -975,8 +1011,6 @@ cl_mem_ext dec(cl_mem_ext buf, int dec, size_t len) {
}

cl_mem_ext mov(cl_mem_ext buf, size_t len) {
assert(len > 0);

cl_buffer_region region;

region.origin = buf.off * buf.obs;
Expand Down Expand Up @@ -1006,8 +1040,6 @@ cl_mem_ext rem(cl_mem_ext buf, int dec, size_t len) {
}

cl_mem_ext upd(cl_mem_ext buf, size_t len) {
assert(len > 0);

cl_mem_ext ret;

ret.org = buf.org;
Expand Down

0 comments on commit 6e1c307

Please sign in to comment.