-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmain.cpp
264 lines (218 loc) · 9.32 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
// This was written on PC using VC++, so I'm using SSE2 for the 16-byte copies
// here; on ARM targets you would use NEON instead.
#include <emmintrin.h>
typedef __m128i pixel4_t;
static inline pixel4_t loadpixel4u(const uint32_t *src)
{
// unaligned load!
return _mm_loadu_si128((const __m128i *) src);
}
static void storepixel4(uint32_t *dest, pixel4_t vals)
{
// aligned store
_mm_store_si128((__m128i *) dest, vals);
}
// Pixel order with supertiling:
// https://github.com/laanwj/etna_viv/blob/master/doc/hardware.md#texture-tiling
//
// (bit index) c b a 9 8 7 6 5 4 3 2 1 0
// index = tile_idx:y5y4x5x4x3y3y2x2y1y0x1x0
// = index_x_tile | index_y
//
// with
//
// (bit index) c b a 9 8 7 6 5 4 3 2 1 0
// index_x_tile = tile_idx:____x5x4x3____x2____x1x0
// index_y = _________y5y4______y3y2__y1y0____
// width of the "outermost" tile (i.e. supertile in this case)
static uintptr_t swizzle_outermost_tile_w()
{
return 64;
}
// height of the "outermost" tile
static uintptr_t swizzle_outermost_tile_h()
{
return 64;
}
static uintptr_t swizzle_x_tile(uintptr_t x)
{
return ((x & 0x03) << 0) | ((x & 0x04) << 2) | ((x & 0x38) << 4) | ((x & ~0x3f) << 6);
}
static uintptr_t swizzle_y(uintptr_t y)
{
return ((y & 0x03) << 2) | ((y & 0x0c) << 3) | ((y & 0x30) << 6);
}
// "align" must be a power of 2.
static uintptr_t align_down(uintptr_t x, uintptr_t align)
{
return x & ~(align - 1);
}
static uintptr_t align_up(uintptr_t x, uintptr_t align)
{
return (x + align-1) & ~(align - 1);
}
// Swizzle sw*sh pixels worth of 32bpp texel data from "src" to "dest" at position (dx,dy).
// "dest" is a 2D texture of dw*dh pixels. Finally, "spitch" is the distance between rows
// in the source image (in units of 32bpp texels).
static void swizzle_32bpp_small(uint32_t *dest, uint32_t dx, uint32_t dy, uint32_t dw, uint32_t dh,
const uint32_t *src, uint32_t sw, uint32_t sh, uint32_t spitch)
{
// Note that this function doesn't presume anything about the details of the tiling scheme,
// other than assuming that the outermost tiles are stored in row-major order.
//
// In other words, you can make this produce a different tiling scheme just by changing
// the swizzle_* functions above.
uintptr_t x_mask = swizzle_x_tile(~0u);
uintptr_t y_mask = swizzle_y(~0u);
uintptr_t incr_y = swizzle_x_tile(align_up(dw, swizzle_outermost_tile_w()));
uintptr_t offs_x0_tile = swizzle_x_tile(dx) + incr_y * (dy / swizzle_outermost_tile_h());
uintptr_t offs_y = swizzle_y(dy);
ptrdiff_t x_mask_bytes = x_mask * sizeof(uint32_t);
for (uint32_t y = 0; y < sh; y++) {
char *dest_line = (char *) (dest + offs_y);
uintptr_t offs_x = offs_x0_tile * sizeof(uint32_t); // convert to bytes
for (uint32_t x = 0; x < sw; x++) {
*((uint32_t *) (dest_line + offs_x)) = src[x];
offs_x = (offs_x - x_mask_bytes) & x_mask_bytes;
}
// advance pointers
src += spitch;
offs_y = (offs_y - y_mask) & y_mask;
if (!offs_y) // wrapped into next tile row
offs_x0_tile += incr_y;
}
}
void swizzle_32bpp(uint32_t *dest, uint32_t dx, uint32_t dy, uint32_t dw, uint32_t dh,
const uint32_t *src, uint32_t sw, uint32_t sh, uint32_t spitch)
{
// Innermost loop processes 4x4 pixel tiles at a time.
static const uint32_t inner_tile_w = 4;
static const uint32_t inner_tile_h = 4;
// At 32 bits/pixel, this works out to 64 bytes - a full cache line.
// Depending on the final implementation, you might want to process slightly larger
// blocks (e.g. 8x4 or 8x8) at a time to amortize overhead better.
//
// The innermost loop here knows about the tile layout; the rest of the code is
// fully generic and only uses the swizzle_* functions to determine the layout!
uint32_t dx0 = dx;
uint32_t dx1 = align_up(dx, inner_tile_w);
uint32_t dx2 = align_down(dx + sw, inner_tile_w);
uint32_t dx3 = dx + sw;
uint32_t dy0 = dy;
uint32_t dy1 = align_up(dy, inner_tile_h);
uint32_t dy2 = align_down(dy + sh, inner_tile_h);
uint32_t dy3 = dy + sh;
// if we don't cover any full tiles, use the small loop
if (dx2 <= dx1 || dy2 <= dy1) {
swizzle_32bpp_small(dest, dx, dy, dw, dh, src, sw, sh, spitch);
return;
}
// vertical slice [dy0,dy1)
if (dy0 < dy1) {
swizzle_32bpp_small(dest, dx0, dy0, dw, dh, src, sw, dy1 - dy0, spitch);
src += (dy1 - dy0) * spitch;
}
// vertical slice [dy1,dy2) - tile aligned in y
if (dx0 < dx1) // leftovers at left side?
swizzle_32bpp_small(dest, dx0, dy1, dw, dh, src, dx1 - dx0, dy2 - dy1, spitch);
// center part (fully tile aligned in dest)
{
ptrdiff_t x_step4_bytes = swizzle_x_tile(~0u * inner_tile_w) * sizeof(uint32_t);
uintptr_t y_step4 = swizzle_y(~0u * inner_tile_h);
uintptr_t incr_y = swizzle_x_tile(align_up(dw, swizzle_outermost_tile_w()));
uintptr_t offs_x0_tile = swizzle_x_tile(dx1) + incr_y * (dy1 / swizzle_outermost_tile_h());
uintptr_t offs_y = swizzle_y(dy1);
const uint32_t *src_line = src + (dx1 - dx0);
for (uint32_t y = dy1; y < dy2; y += inner_tile_h) {
char *dest_line = (char *) (dest + offs_y);
uintptr_t offs_x = offs_x0_tile * sizeof(uint32_t); // convert to bytes
for (uint32_t x = dx1; x < dx2; x += inner_tile_w) {
// NB for this loop you really want to make sure the generated code is
// good (or possibly hand-write assembly code); this is just for illustration.
pixel4_t row0 = loadpixel4u(src_line);
pixel4_t row1 = loadpixel4u(src_line + spitch);
pixel4_t row2 = loadpixel4u(src_line + 2*spitch);
pixel4_t row3 = loadpixel4u(src_line + 3*spitch);
uint32_t *dest_tile = (uint32_t *) (dest_line + offs_x);
storepixel4(dest_tile + 0, row0);
storepixel4(dest_tile + 4, row1);
storepixel4(dest_tile + 8, row2);
storepixel4(dest_tile + 12, row3);
offs_x = (offs_x - x_step4_bytes) & x_step4_bytes;
src_line += inner_tile_w;
}
// advance pointers
src_line += inner_tile_h * spitch - (dx2 - dx1);
offs_y = (offs_y - y_step4) & y_step4;
if (!offs_y) // wrapped into next tile row
offs_x0_tile += incr_y;
}
}
if (dx2 < dx3) // leftovers at right side?
swizzle_32bpp_small(dest, dx2, dy1, dw, dh, src + (dx2 - dx0), dx3 - dx2, dy2 - dy1, spitch);
src += (dy2 - dy1) * spitch;
// vertical slice [dy2,dy3)
if (dy2 < dy3)
swizzle_32bpp_small(dest, dx0, dy2, dw, dh, src, sw, dy3 - dy2, spitch);
}
// ---- Reference impl and test driver
static void swizzle_32bpp_ref(uint32_t *dest, uint32_t dx, uint32_t dy, uint32_t dw, uint32_t dh,
const uint32_t *src, uint32_t sw, uint32_t sh, uint32_t spitch)
{
uintptr_t tile_y_stride = swizzle_x_tile(align_up(dw, swizzle_outermost_tile_w()));
for (uint32_t y = 0; y < sh; y++) {
uintptr_t dest_y_offs = ((dy + y) / swizzle_outermost_tile_h()) * tile_y_stride + swizzle_y(dy + y);
uint32_t *dest_y = dest + dest_y_offs;
for (uint32_t x = 0; x < sw; x++)
dest_y[swizzle_x_tile(dx + x)] = src[x];
src += spitch;
}
}
int main(int argc, char **argv)
{
static const uint32_t max_w = 128;
static const uint32_t max_h = 128;
static const uint32_t n_runs = 100000;
uint32_t *linear = new uint32_t[max_w * max_h];
uint32_t *tiled_ref = (uint32_t *) _aligned_malloc(max_w * max_h * sizeof(uint32_t), 64);
uint32_t *tiled_fast = (uint32_t *) _aligned_malloc(max_w * max_h * sizeof(uint32_t), 64);
// random test texture
for (uint32_t i = 0; i < max_w * max_h; i++)
linear[i] = rand();
// do some test runs
for (uint32_t run = 0; run < n_runs; run++) {
static const size_t tiled_size = max_w * max_h * sizeof(uint32_t);
memset(tiled_ref, 0xcc, tiled_size);
memset(tiled_fast, 0xcc, tiled_size);
// determine random target rectangle
uint32_t x0 = rand() % max_w;
uint32_t y0 = rand() % max_h;
uint32_t x1 = rand() % max_w;
uint32_t y1 = rand() % max_h;
if (x0 > x1) std::swap(x0, x1);
if (y0 > y1) std::swap(y0, y1);
// determine random destination texture size that fits target rect
uint32_t dw = x1;
uint32_t dh = y1;
if (dw < max_w) dw += rand() % (max_w - dw);
if (dh < max_h) dh += rand() % (max_h - dh);
// swizzle two ways
swizzle_32bpp_ref(tiled_ref, x0, y0, dw, dh, linear, x1 - x0, y1 - y0, x1 - x0);
swizzle_32bpp(tiled_fast, x0, y0, dw, dh, linear, x1 - x0, y1 - y0, x1 - x0);
if (memcmp(tiled_ref, tiled_fast, tiled_size) != 0) {
printf("mismatch!\n");
printf("rect=(%d,%d)-(%d,%d), dw=%d dh=%d\n", x0, y0, x1, y1, dw, dh);
return 1;
}
}
delete[] linear;
_aligned_free(tiled_ref);
_aligned_free(tiled_fast);
printf("all ok!\n");
return 0;
}