Skip to content

Commit

Permalink
Merge pull request #38 from lujnan/experimental
Browse files: browse the repository at this point in the history
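Fix the loop-index increment error in the 'booster' vsub & vmul kernels: advance 'i' by 4 per iteration (one 4-float vector) instead of by 1, and loop while i < len rather than i < len - 4.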
  • Loading branch information
Mengjintao authored Apr 16, 2019
2 parents dd864f7 + 0270846 commit 5023303
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
6 changes: 3 additions & 3 deletions src/booster/arm/generic_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ template void add_relu<false>(float* dst, const float* A, const float* B, const
void vsub(float* dst, float* A, float* B, size_t len, size_t num_threads)
{
#pragma omp parallel for num_threads(num_threads) schedule(static)
for (int i = 0; i < len - 4; ++i)
for (int i = 0; i < len; i += 4)
{
float32x4_t vA = vld1q_f32(A + i);
float32x4_t vB = vld1q_f32(B + i);
Expand All @@ -188,7 +188,7 @@ void vsub(float* dst, float* A, float* B, size_t len, size_t num_threads)
void vmul(float* dst, float* A, float* B, size_t len, size_t num_threads)
{
#pragma omp parallel for num_threads(num_threads) schedule(static)
for (int i = 0; i < len - 4; ++i)
for (int i = 0; i < len; i += 4)
{
float32x4_t vA = vld1q_f32(A + i);
float32x4_t vB = vld1q_f32(B + i);
Expand Down Expand Up @@ -459,4 +459,4 @@ void reluVecOpenmp(float* arr, int len, int nThreads)
for (int i = aLen; i < len; i++)
if (arr[i] < 0) arr[i] = 0;
}
};
};
4 changes: 2 additions & 2 deletions src/booster/avx/generic_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ template void add_relu<false>(float* dst, const float* A, const float* B, const
void vsub(float* dst, float* A, float* B, size_t len, size_t num_threads)
{
#pragma omp parallel for num_threads(num_threads) schedule(static)
for (int i = 0; i < len - 4; ++i)
for (int i = 0; i < len; i += 4)
{
__m128 vA = _mm_load_ps(A + i);
__m128 vB = _mm_load_ps(B + i);
Expand All @@ -188,7 +188,7 @@ void vsub(float* dst, float* A, float* B, size_t len, size_t num_threads)
void vmul(float* dst, float* A, float* B, size_t len, size_t num_threads)
{
#pragma omp parallel for num_threads(num_threads) schedule(static)
for (int i = 0; i < len - 4; ++i)
for (int i = 0; i < len; i += 4)
{
__m128 vA = _mm_load_ps(A + i);
__m128 vB = _mm_load_ps(B + i);
Expand Down

0 comments on commit 5023303

Please sign in to comment.