[apps] 🎨 Clang-format pass
mp-17 committed Nov 29, 2023
1 parent 598ee89 commit d900f6a
Showing 6 changed files with 67 additions and 80 deletions.
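Note: the changes below are formatting-only reflows, consistent with a repository-wide formatter run. Such a pass is typically produced by something along the lines of find apps \( -name '*.c' -o -name '*.h' \) | xargs clang-format -i against the project's .clang-format style file; the exact command and file set are assumptions, not recorded in the commit.
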
11 changes: 5 additions & 6 deletions apps/dwt/kernel/wavelet.c
@@ -21,8 +21,8 @@
#include <stdio.h>

// Reduce scalar code overhead for problems that can fit with LMUL == 4.
// The worst case if with 2 lanes, in which the problem size should be lower than
// 2k float numbers.
// The worst case if with 2 lanes, in which the problem size should be lower
// than 2k float numbers.
#define SMALL_PROBLEM

extern int64_t event_trigger;
@@ -110,10 +110,9 @@ static inline void dwt_step_vector(const gsl_wavelet *w, float *samples,
// Segment load the vectors. ToDo: check if vl/2 is correct
vlseg2e32_v_f32m4(sample_vec_0, sample_vec_1, samples_r, vl / 2);
#else
// Strided load (inefficient!)
sample_vec_0 = vlse32_v_f32m4(samples_r, 2 * sizeof(*samples_r), vl / 2);
sample_vec_1 =
vlse32_v_f32m4(samples_r + 1, 2 * sizeof(*samples_r), vl / 2);
// Strided load (inefficient!)
sample_vec_0 = vlse32_v_f32m4(samples_r, 2 * sizeof(*samples_r), vl / 2);
sample_vec_1 = vlse32_v_f32m4(samples_r + 1, 2 * sizeof(*samples_r), vl / 2);
#endif

// First implementation
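
Note: for readers unfamiliar with the two load flavours touched above, the segment load (vlseg2e32) splits interleaved even/odd samples in a single unit-stride access, while the fallback issues two strided loads with a 2-element stride, which is why it is flagged as inefficient. A scalar sketch of the access pattern both variants reproduce (an illustrative helper, not part of the kernel; the name deinterleave_ref is made up):

// Even-indexed samples go to vec0, odd-indexed samples to vec1;
// n_pairs plays the role of vl / 2 in the vector code above.
static void deinterleave_ref(const float *samples_r, float *vec0, float *vec1,
                             int n_pairs) {
  for (int k = 0; k < n_pairs; ++k) {
    vec0[k] = samples_r[2 * k];     // stride 2 * sizeof(*samples_r), offset 0
    vec1[k] = samples_r[2 * k + 1]; // stride 2 * sizeof(*samples_r), offset 1
  }
}
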
4 changes: 2 additions & 2 deletions apps/fconv3d/fconv3d.h
@@ -29,10 +29,10 @@ void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
int64_t n_, int64_t C, int64_t F);

void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
int64_t C, int64_t F);
int64_t C, int64_t F);

void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
int64_t n_, int64_t C, int64_t F);
int64_t n_, int64_t C, int64_t F);

#define MIN(a, b) ((a) < (b) ? (a) : (b))

59 changes: 25 additions & 34 deletions apps/fconv3d/fconv3d_3x7x7.c
@@ -76,7 +76,7 @@ void fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N,
}

void fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N,
int64_t C, int64_t F) {
int64_t C, int64_t F) {

unsigned long int block_size_n;

@@ -906,9 +906,8 @@ void fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N,
asm volatile("vse64.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo));
}


void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
int64_t n_, int64_t C, int64_t F) {
int64_t n_, int64_t C, int64_t F) {

// Helper variables
int64_t ldo = N << 3;
@@ -1006,7 +1005,6 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1);
i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1);


// Main kernel, unrolled by 2
for (int k = 0; k < F / 2; ++k) {
// Two base indexes because of the unrolling
@@ -1042,7 +1040,6 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));

asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));

}

// The very last iterations require mixing the instructions with the store
@@ -1059,7 +1056,6 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
// Reuse preloaded coefficients
// Buffer the next coefficients for faster use


// Bump the input ptr
i_ += 3 * (N + F - 1);

@@ -1124,43 +1120,41 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,

if (ch != C - 1) {
int64_t base_idx_0 = (ch + 1) * fch_len;

}
}
}
}

// Bump the input ptr
i_ += N + F - 1;
// Bump the input ptr
i_ += N + F - 1;

#ifdef VCD_DUMP
// Stop dumping VCD
event_trigger = -1;
// Stop dumping VCD
event_trigger = -1;
#endif

//////////////
// UNROLL 1 //
//////////////
//////////////
// UNROLL 1 //
//////////////

// Loop on the channels
for (int ch = 0; ch < C; ++ch) {
// Loop on the channels
for (int ch = 0; ch < C; ++ch) {

// Point to the first element of the channel ch
i__ = i_ + ch * ich_len;
// Point to the first element of the channel ch
i__ = i_ + ch * ich_len;

// Start calculating the next pointers to the elements to be slided in
i_slide_ptr_1 = i__ + n_;
// Start calculating the next pointers to the elements to be slided in
i_slide_ptr_1 = i__ + n_;

for (int k = 0; k < F / 2; ++k) {
// Two base indexes because of the unrolling
// Point to the first element of the current column (k) of the current
// channel (ch) of the filter (f)
int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
// Point to the first element of the current column (k+1) of the current
// channel (ch) of the filter (f)
int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
for (int k = 0; k < F / 2; ++k) {
// Two base indexes because of the unrolling
// Point to the first element of the current column (k) of the current
// channel (ch) of the filter (f)
int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len);
// Point to the first element of the current column (k+1) of the current
// channel (ch) of the filter (f)
int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len);
}


// Bump the input ptr
i_ += N + F - 1;
}
@@ -1196,7 +1190,7 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
if ((k | ch) == 0)
asm volatile("vfmul.vf v28, v0, %0" ::"f"(f[0 + base_idx_0]));
else
asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++));
asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++));
asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++));

@@ -1205,10 +1199,8 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++));
asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++));
}

}


// Bump the input ptr
i_ += 4 * (N + F - 1);

@@ -1251,7 +1243,6 @@ void fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N,
}
}


/*
////////////////////
// MAIN ALGOMITHM //
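
Note: as a reference for the vfmul/vfslide1down sequences reflowed in this file, the kernel computes a 2D convolution accumulated over C input channels with an F x F filter on an (M+F-1) x (N+F-1) input tile. A scalar sketch under assumed row-major layouts (an illustration only, not the project's verification code; the name fconv3d_ref is made up):

#include <stdint.h>

// o: M x N output, i: C x (M+F-1) x (N+F-1) input, f: C x F x F filter.
static void fconv3d_ref(double *o, const double *i, const double *f, int64_t M,
                        int64_t N, int64_t C, int64_t F) {
  for (int64_t r = 0; r < M; ++r)
    for (int64_t c = 0; c < N; ++c) {
      double acc = 0.0;
      for (int64_t ch = 0; ch < C; ++ch)
        for (int64_t fr = 0; fr < F; ++fr)
          for (int64_t fc = 0; fc < F; ++fc)
            acc += i[(ch * (M + F - 1) + (r + fr)) * (N + F - 1) + (c + fc)] *
                   f[(ch * F + fr) * F + fc];
      o[r * N + c] = acc;
    }
}
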
66 changes: 30 additions & 36 deletions apps/jacobi2d/kernel/jacobi2d.c
@@ -165,11 +165,11 @@ void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vsetvli %0, %1, e64, m4, ta, ma"
: "=r"(gvl)
: "r"(size_x - j + 1));
mtx_ptr_0 = j; // 0 * c + j
mtx_ptr_0 = j; // 0 * c + j
asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
mtx_ptr_1 = j + c; // 1 * c + j
mtx_ptr_1 = j + c; // 1 * c + j
asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_1])); // v4 middle
mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom

// Look ahead and load the next coefficients
@@ -201,16 +201,15 @@ void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v4, %0" ::"f"(izq_0));
asm volatile("vfslide1down.vf v28, v4, %0" ::"f"(der_0));
asm volatile("vfadd.vv v12, v4, v0"); // middle - top
mtx_ptr_0 += c; // (i + 2) * c + j
mtx_ptr_0 += c; // (i + 2) * c + j
asm volatile("vfadd.vv v12, v12, v8"); // bottom
sc_ptr_0 += c; // (i + 1) * c + j - 1
sc_ptr_0 += c; // (i + 1) * c + j - 1
asm volatile("vfadd.vv v12, v12, v24"); // left
if ((i + 1) <= size_y) {
asm volatile(
"vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
}
asm volatile("vfadd.vv v12, v12, v28"); // right
sc_ptr_1 += c; // (i + 1) * c + j + gvl
sc_ptr_1 += c; // (i + 1) * c + j + gvl
asm volatile("vfmul.vf v12, v12, %0" ::"f"(five_));
if ((i + 1) <= size_y) {
izq_1 = A[sc_ptr_0];
@@ -228,16 +227,15 @@ void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v8, %0" ::"f"(izq_1));
asm volatile("vfslide1down.vf v28, v8, %0" ::"f"(der_1));
asm volatile("vfadd.vv v16, v4, v8"); // middle - top
mtx_ptr_0 += c; // (i + 3) * c + j
mtx_ptr_0 += c; // (i + 3) * c + j
asm volatile("vfadd.vv v16, v16, v0"); // bottom
sc_ptr_0 += c; // (i + 2) * c + j - 1
sc_ptr_0 += c; // (i + 2) * c + j - 1
asm volatile("vfadd.vv v16, v16, v24"); // left
if ((i + 2) <= size_y) {
asm volatile(
"vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
}
asm volatile("vfadd.vv v16, v16, v28"); // right
sc_ptr_1 += c; // (i + 2) * c + j + gvl
sc_ptr_1 += c; // (i + 2) * c + j + gvl
asm volatile("vfmul.vf v16, v16, %0" ::"f"(five_));
if ((i + 2) <= size_y) {
izq_2 = A[sc_ptr_0];
@@ -255,16 +253,15 @@ void j2d_kernel_adhoc_warm(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v0, %0" ::"f"(izq_2));
asm volatile("vfslide1down.vf v28, v0, %0" ::"f"(der_2));
asm volatile("vfadd.vv v20, v0, v8"); // middle - top
mtx_ptr_0 += c; // (i + 4) * c + j
mtx_ptr_0 += c; // (i + 4) * c + j
asm volatile("vfadd.vv v20, v20, v4"); // bottom
sc_ptr_0 += c; // (i + 3) * c + j - 1
sc_ptr_0 += c; // (i + 3) * c + j - 1
asm volatile("vfadd.vv v20, v20, v24"); // left
if ((i + 3) <= size_y) {
asm volatile("vle64.v v8, (%0)" ::"r"(
&A[mtx_ptr_0])); // v8 bottom
asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
}
asm volatile("vfadd.vv v20, v20, v28"); // right
sc_ptr_1 += c; // (i + 3) * c + j + gvl
sc_ptr_1 += c; // (i + 3) * c + j + gvl
asm volatile("vfmul.vf v20, v20, %0" ::"f"(five_));
if ((i + 3) <= size_y) {
izq_0 = A[sc_ptr_0];
@@ -308,11 +305,11 @@ void j2d_kernel_asm_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vsetvli %0, %1, e64, m4, ta, ma"
: "=r"(gvl)
: "r"(size_x - j + 1));
mtx_ptr_0 = j; // 0 * c + j
mtx_ptr_0 = j; // 0 * c + j
asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
mtx_ptr_1 = j + c; // 1 * c + j
mtx_ptr_1 = j + c; // 1 * c + j
asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_1])); // v4 middle
mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
mtx_ptr_0 = mtx_ptr_1 + c; // 2 * c + j
asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom

// Look ahead and load the next coefficients
@@ -344,16 +341,15 @@ void j2d_kernel_asm_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v4, %0" ::"f"(izq_0));
asm volatile("vfslide1down.vf v28, v4, %0" ::"f"(der_0));
asm volatile("vfadd.vv v12, v4, v0"); // middle - top
mtx_ptr_0 += c; // (i + 2) * c + j
mtx_ptr_0 += c; // (i + 2) * c + j
asm volatile("vfadd.vv v12, v12, v8"); // bottom
sc_ptr_0 += c; // (i + 1) * c + j - 1
sc_ptr_0 += c; // (i + 1) * c + j - 1
asm volatile("vfadd.vv v12, v12, v24"); // left
if ((i + 1) <= size_y) {
asm volatile(
"vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
asm volatile("vle64.v v0, (%0)" ::"r"(&A[mtx_ptr_0])); // v0 top
}
asm volatile("vfadd.vv v12, v12, v28"); // right
sc_ptr_1 += c; // (i + 1) * c + j + gvl
sc_ptr_1 += c; // (i + 1) * c + j + gvl
asm volatile("vfmul.vf v12, v12, %0" ::"f"(five_));
if ((i + 1) <= size_y) {
izq_1 = A[sc_ptr_0];
@@ -371,16 +367,15 @@ void j2d_kernel_asm_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v8, %0" ::"f"(izq_1));
asm volatile("vfslide1down.vf v28, v8, %0" ::"f"(der_1));
asm volatile("vfadd.vv v16, v4, v8"); // middle - top
mtx_ptr_0 += c; // (i + 3) * c + j
mtx_ptr_0 += c; // (i + 3) * c + j
asm volatile("vfadd.vv v16, v16, v0"); // bottom
sc_ptr_0 += c; // (i + 2) * c + j - 1
sc_ptr_0 += c; // (i + 2) * c + j - 1
asm volatile("vfadd.vv v16, v16, v24"); // left
if ((i + 2) <= size_y) {
asm volatile(
"vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
asm volatile("vle64.v v4, (%0)" ::"r"(&A[mtx_ptr_0])); // v4 middle
}
asm volatile("vfadd.vv v16, v16, v28"); // right
sc_ptr_1 += c; // (i + 2) * c + j + gvl
sc_ptr_1 += c; // (i + 2) * c + j + gvl
asm volatile("vfmul.vf v16, v16, %0" ::"f"(five_));
if ((i + 2) <= size_y) {
izq_2 = A[sc_ptr_0];
@@ -398,16 +393,15 @@ void j2d_kernel_asm_v(uint64_t r, uint64_t c, DATA_TYPE *A, DATA_TYPE *B) {
asm volatile("vfslide1up.vf v24, v0, %0" ::"f"(izq_2));
asm volatile("vfslide1down.vf v28, v0, %0" ::"f"(der_2));
asm volatile("vfadd.vv v20, v0, v8"); // middle - top
mtx_ptr_0 += c; // (i + 4) * c + j
mtx_ptr_0 += c; // (i + 4) * c + j
asm volatile("vfadd.vv v20, v20, v4"); // bottom
sc_ptr_0 += c; // (i + 3) * c + j - 1
sc_ptr_0 += c; // (i + 3) * c + j - 1
asm volatile("vfadd.vv v20, v20, v24"); // left
if ((i + 3) <= size_y) {
asm volatile("vle64.v v8, (%0)" ::"r"(
&A[mtx_ptr_0])); // v8 bottom
asm volatile("vle64.v v8, (%0)" ::"r"(&A[mtx_ptr_0])); // v8 bottom
}
asm volatile("vfadd.vv v20, v20, v28"); // right
sc_ptr_1 += c; // (i + 3) * c + j + gvl
sc_ptr_1 += c; // (i + 3) * c + j + gvl
asm volatile("vfmul.vf v20, v20, %0" ::"f"(five_));
if ((i + 3) <= size_y) {
izq_0 = A[sc_ptr_0];
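
Note: the add/slide sequences reflowed in this file implement a 5-point Jacobi stencil: left and right neighbours come from the vfslide1up/vfslide1down of the row held in registers, top and bottom from the previously loaded rows, and the sum is scaled by the five_ constant. A scalar sketch of the update (an illustration assuming DATA_TYPE is double and five_ equals 0.2, as in the usual PolyBench-style formulation; the name j2d_ref is made up):

#include <stdint.h>

// Each interior point becomes 0.2 * (self + left + right + top + bottom).
static void j2d_ref(uint64_t r, uint64_t c, const double *A, double *B) {
  for (uint64_t i = 1; i < r - 1; ++i)
    for (uint64_t j = 1; j < c - 1; ++j)
      B[i * c + j] = 0.2 * (A[i * c + j] + A[i * c + (j - 1)] +
                            A[i * c + (j + 1)] + A[(i - 1) * c + j] +
                            A[(i + 1) * c + j]);
}
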
4 changes: 3 additions & 1 deletion apps/roi_align/kernel/roi_align.h
@@ -56,7 +56,9 @@ int64_t CropAndResizePerBox_BHWC_vec(
float *crops_data, const int crop_height, const int crop_width,
const float extrapolation_value);

void roi_align_fake_kernel_asm(float* pimage, float* crops_data, int left_x_index, int right_x_index, int b, int y, size_t depth);
void roi_align_fake_kernel_asm(float *pimage, float *crops_data,
int left_x_index, int right_x_index, int b,
int y, size_t depth);

// Normalized image
void init_image(float *vec, size_t size);
Expand Down
3 changes: 2 additions & 1 deletion apps/roi_align/main.c
@@ -121,7 +121,8 @@ int main() {

printf("Starting vector main kernel...\n");
start_timer();
roi_align_fake_kernel_asm(image_data, crops_data_vec, left_x_index, right_x_index, b, y, DEPTH);
roi_align_fake_kernel_asm(image_data, crops_data_vec, left_x_index,
right_x_index, b, y, DEPTH);
stop_timer();
runtime_v = get_timer();
printf("Vector benchmark complete.\n");
