-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathamd_device_functions.h
1010 lines (867 loc) · 30.6 KB
/
amd_device_functions.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include <hip/amd_detail/device_library_decls.h>
#include <hip/amd_detail/hip_assert.h>
#include "host_defines.h"
#include "math_fwd.h"
#include <hip/hip_runtime_api.h>
#include <stddef.h>
#include <hip/hip_vector_types.h>
#endif // !defined(__HIPCC_RTC__)
#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif
extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
/*
Integer Intrinsics
*/
// Integer intrinsics: __popc, __clz, __ffs (bit counting / scanning).
// __popc: number of set bits in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
// __popcll: number of set bits in a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}
// __clz: count of leading zero bits (input reinterpreted as unsigned;
// the input == 0 case is defined by __ockl_clz_u32).
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}
// __ffs: 1-based position of the least significant set bit; 0 when input == 0.
__device__ static inline int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
__device__ static inline int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
// Signed overloads forward to the same count-trailing-zeros builtins.
__device__ static inline int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
__device__ static inline int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
// If not found, return -1.
// Returns the bit position of the n-th (|offset|) set bit of `mask`, counting
// from bit `base` (upward for offset > 0, downward for offset < 0; offset == 0
// tests bit `base` itself). Returns -1 when no such bit exists.
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        // Fix: the shift must be 64-bit. `1 << base` is an int shift, which is
        // undefined/sign-extended for base >= 31 and always wrong for base > 31.
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        // Downward search: reverse the mask and mirror the base, then search up.
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }
    // Discard bits below `base`; bail out early if fewer than temp_offset
    // candidate bits remain.
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    // Binary search for the temp_offset-th set bit, halving the window from
    // 32 bits down to 1.
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            // Target bit is in the upper half: shift it down and keep counting.
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    // Undo the bit reversal for downward searches.
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
// 32-bit-base variant of __fns64: finds the n-th (|offset|) set bit of `mask`
// from bit `base` and returns its position, or -1 if not found.
// NOTE(review): the mask parameter and the reverse path are 64-bit wide, same
// as __fns64 — presumably intentional for wave64 masks; confirm upstream.
__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        // Fix: use a 64-bit shift. `1 << base` is an int shift; for base == 31
        // the int value 0x80000000 sign-extends to 0xFFFFFFFF80000000 when
        // combined with the uint64_t mask, selecting far more than one bit.
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        // Downward search via 64-bit reversal (mirrors __fns64).
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }
    // Discard bits below `base`; early out when not enough set bits remain.
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    // Binary search for the temp_offset-th set bit.
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    // Undo the reversal for downward searches.
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
// Bit reversal: mirror all bits of the input (msb <-> lsb).
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}
// Index of the least significant set bit of a 64-bit value, or 0xFFFFFFFF
// (the unsigned wrap of -1) as a sentinel when input == 0.
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    // Fix: use the `long long` builtin so the operand is 64-bit regardless of
    // the width of `long` (__builtin_ctzl counts `unsigned long`, which is
    // 32-bit on some hosts and would truncate the argument).
    return input == 0 ? -1 : __builtin_ctzll(input);
}
// Extract `width` (src2 mod 32) bits of src0 starting at bit `offset`
// (src1 mod 32); width 0 yields 0.
// NOTE(review): assumes offset + width <= 32 (otherwise the left shift
// discards bits) — confirm against callers.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}
// 64-bit variant: offset and width taken mod 64.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
// Insert the low `width` (src3 mod 32) bits of src1 into src0 at bit `offset`
// (src2 mod 32); all other bits of src0 are preserved.
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    // Fix: build the mask with an unsigned literal; `1 << 31` shifts into the
    // sign bit of a signed int (implementation-defined / UB pre-C++14) when
    // width == 31. The u64 variant below already uses 1ULL.
    uint32_t mask = (1u << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
// 64-bit variant: offset and width taken mod 64.
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
// Funnel shift left: shift the 64-bit concatenation {hi:lo} left by
// (shift & 31) and return the high 32 bits. alignbit extracts 32 bits at a
// right-shift distance, hence the `32 - shift` conversion; shift 0 is special
// cased because alignbit with 32 would be out of range.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}
// Clamped variant: shift amounts >= 32 clamp to 32 (result is lo) instead of
// wrapping mod 32.
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}
// Funnel shift right: shift {hi:lo} right and return the low 32 bits
// (the hardware applies the shift modulo 32).
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}
// Clamped variant: shift amounts >= 32 return hi.
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
// Helper unions for __byte_perm: view one 32-bit word (ucharHolder) or a pair
// of words (uchar2Holder) as individually addressable bytes.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));
struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
// Select four bytes from the 8-byte concatenation {x, y} according to
// selector s: the low nibbles of s's two low bytes each pick a source byte
// index (0-7), building the result a byte at a time from lsb to msb.
// NOTE(review): relies on the union giving little-endian byte order — the
// standard layout on amdgcn, but confirm if this header is host-compiled.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    struct uchar2Holder cHoldVal;
    struct ucharHolder cHoldKey;
    cHoldKey.ui = s;
    cHoldVal.ui[0] = x;
    cHoldVal.ui[1] = y;
    unsigned int result;
    result = cHoldVal.c[cHoldKey.c[0] & 0x07];
    result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
    result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
    result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
    return result;
}
// Signed halving add: computes (x + y) >> 1 without overflowing the
// intermediate sum (rounds toward negative infinity), matching CUDA __hadd.
// Fixes the original, which masked the sign bit with 0x8000000 (one zero
// short of 0x80000000) and combined the halves with logical `||` instead of
// bitwise or — so it collapsed nearly every result to 0 or 1.
__device__ static inline unsigned int __hadd(int x, int y) {
    // (x & y) keeps the common bits, ((x ^ y) >> 1) halves the differing
    // bits (arithmetic shift); their sum is exactly (x + y) >> 1 with no
    // intermediate overflow.
    return (x & y) + ((x ^ y) >> 1);
}
// 24-bit multiply: low 32 bits of the product of the operands' low 24 bits.
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
// High 64 bits of the 128-bit signed product of x and y, via 32x32-bit
// schoolbook partial products (no 128-bit type needed).
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    unsigned long long x0 = (unsigned long long)x & 0xffffffffUL;
    long long x1 = x >> 32;
    unsigned long long y0 = (unsigned long long)y & 0xffffffffUL;
    long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    long long t = x1*y0 + (z0 >> 32);
    long long z1 = t & 0xffffffffL;
    long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
// High 32 bits of the 64-bit signed product.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
// Signed rounded halving add: computes (x + y + 1) >> 1 without overflowing
// the intermediate sum (rounds toward positive infinity), matching CUDA
// __rhadd. Fixes the original, which used the truncated mask 0x8000000 and
// logical `||` instead of bitwise or, yielding only 0 or 1.
__device__ static inline int __rhadd(int x, int y) {
    // Identity: (x | y) - ((x ^ y) >> 1) == (x + y + 1) >> 1, with the shift
    // arithmetic and no intermediate overflow.
    return (x | y) - ((x ^ y) >> 1);
}
// Sum of absolute difference: |x - y| + z.
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
// Unsigned halving add: computes (x + y) >> 1 without overflow in the
// intermediate sum. The original evaluated (x + y) >> 1 directly, which
// wraps and drops the carry bit whenever x + y >= 2^32, producing a result
// 2^31 too small.
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    // (x & y) + ((x ^ y) >> 1) == (x + y) >> 1 in full 33-bit precision.
    return (x & y) + ((x ^ y) >> 1);
}
// Unsigned 24-bit multiply: low 32 bits of the product of the low 24 bits.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
// High 64 bits of the 128-bit unsigned product of x and y, via 32x32-bit
// schoolbook partial products.
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    unsigned long long x0 = x & 0xffffffffUL;
    unsigned long long x1 = x >> 32;
    unsigned long long y0 = y & 0xffffffffUL;
    unsigned long long y1 = y >> 32;
    unsigned long long z0 = x0*y0;
    unsigned long long t = x1*y0 + (z0 >> 32);
    unsigned long long z1 = t & 0xffffffffUL;
    unsigned long long z2 = t >> 32;
    z1 = x0*y1 + z1;
    return x1*y1 + z2 + (z1 >> 32);
}
// High 32 bits of the 64-bit unsigned product.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
// Unsigned rounded halving add: computes (x + y + 1) >> 1 without overflow.
// The original evaluated (x + y + 1) >> 1 directly and lost the carry
// whenever x + y + 1 >= 2^32 (e.g. __urhadd(0xFFFFFFFF, 0xFFFFFFFF) must be
// 0xFFFFFFFF, not 0x7FFFFFFF).
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    // Identity: (x | y) - ((x ^ y) >> 1) == (x + y + 1) >> 1 in full precision.
    return (x | y) - ((x ^ y) >> 1);
}
// Unsigned sum of absolute difference, delegated to the OCKL helper.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}
// mbcnt: number of set bits of x in lanes below this lane, plus y
// (lo covers lanes 0-31, hi completes the 64-bit lane mask).
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};
__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
/*
HIP specific device functions
*/
#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#include "amd_warp_sync_functions.h"
#endif
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
// Packed per-byte arithmetic: even bytes (MASK1) and odd bytes (MASK2) are
// processed in separate passes so a per-byte carry/borrow cannot ripple into
// the neighboring byte.
// NOTE(review): only the .w member of char4 is read/written and it is treated
// as a full 32-bit word — this presumes char4's .w aliases the packed value;
// confirm against hip_vector_types.h.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}
// Per-byte subtraction, same two-pass masking scheme.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}
// Per-byte multiplication, same two-pass masking scheme.
// NOTE(review): cross-byte product terms can still pollute the MASK2 pass
// (odd-byte products span more than 8 bits) — verify intended semantics.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
// double -> float conversions with explicit rounding: rd = toward -inf,
// rn = nearest-even, ru = toward +inf, rz = toward zero. rn is the default
// C++ conversion behavior, hence the plain implicit cast.
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}
// Reinterpret the high 32 bits of a double via memcpy (no UB type-punning);
// tmp[1] is the high word on little-endian targets.
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));
    return tmp[1];
}
// Reinterpret the low 32 bits of a double.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));
    return tmp[0];
}
// double -> integer conversions: round with the requested mode in floating
// point first (floor/rint/ceil), then truncate-cast; _rz relies on the cast's
// inherent truncation.
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }
__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
// Bit-level reinterpretation of a double as a 64-bit integer (memcpy avoids
// strict-aliasing UB).
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");
    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);
The above device functions are not valid here.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h
CUDA implements half as unsigned short whereas, HIP doesn't.
*/
// float -> integer conversions: round in floating point with the requested
// mode (floor/rint/ceil/trunc), then cast. Note the unsigned/long-long _rz
// variants rely on the cast's truncation instead of __ocml_trunc_f32.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
// Bit-level reinterpretation of a float as (unsigned) int via memcpy.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");
    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");
    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// Reassemble a double from its high and low 32-bit halves (inverse of
// __double2hiint / __double2loint). The uint32_t cast of lo prevents sign
// extension from corrupting the high word.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
    return tmp1;
}
// int -> double is exact for all 32-bit ints, so a plain cast suffices.
__device__ static inline double __int2double_rn(int x) { return (double)x; }
// int -> float conversions with explicit rounding mode (rn is the default
// cast behavior).
__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
// Bit-level reinterpretation of an int as a float via memcpy.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");
    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// long long -> double/float conversions with explicit rounding mode
// (rn is the default cast behavior).
__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}
__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
// Bit-level reinterpretation of a 64-bit integer as a double via memcpy.
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");
    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
// unsigned int / unsigned long long -> double/float conversions with explicit
// rounding mode (rn variants use the default cast behavior; uint -> double is
// exact).
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
// Bit-level reinterpretation of an unsigned int as a float via memcpy.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");
    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));
    return tmp;
}
__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}
__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
#if defined(__clang__) && defined(__HIP__)
// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();
#ifdef __HIP_DEVICE_COMPILE__
// Clock function to return GPU core cycle count.
// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
// through hipDeviceAttributeClockRate attribute.
// GPU core cycle counter. The core clock may change frequency at runtime;
// the maximum can be queried via hipDeviceAttributeClockRate.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
    // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
    return (long long int) __builtin_amdgcn_s_memtime();
#else
    // Subject to change when better solution available
    return (long long int) __builtin_readcyclecounter();
#endif
}
// __clock is an alias of __clock64 (cycle counter).
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }
// Wall clock counter at a constant frequency; the frequency can be queried
// through the hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
    return (long long int) __ockl_steadyctr_u64();
}
// CUDA-compatible spellings, forwarding to the cycle counter.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }
__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
// hip.amdgcn.bc - named sync
// Legacy hip.amdgcn.bc entry point: a plain workgroup-wide hardware barrier.
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }
#endif // __HIP_DEVICE_COMPILE__
// hip.amdgcn.bc - lanemask
// Mask of currently active lanes whose id is strictly greater than this
// lane's. Hard-coded for wave64 (lane 63 is the top lane, so it returns 0).
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    if (lane == 63)
        return 0;
    // __ballot64(1) yields the mask of all active lanes.
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}
// Mask of currently active lanes whose id is strictly less than this lane's
// (wave64 layout, matching __lanemask_gt).
__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    // Consistency fix: the ballot is a 64-bit lane mask; keep it unsigned as
    // in __lanemask_gt (the previous int64_t held the same bits but invited
    // signed-shift/compare mistakes).
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}
// Single-bit mask identifying this lane within the wave (wave64 layout).
__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    // Fix: store the mask as uint64_t. Assigning (uint64_t)1 << 63 into the
    // previous int64_t relied on implementation-defined narrowing of a value
    // outside the signed range (pre-C++20).
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
__device__ inline void* __local_to_generic(void* p) { return p; }
#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Dynamic shared memory starts immediately after the statically allocated
    // group segment, hence the offset by __builtin_amdgcn_groupstaticsize().
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
// Back-compat alias for __get_dynamicgroupbaseptr.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
// Memory Fence Functions
// __threadfence: sequentially-consistent memory fence at agent (device) scope.
__device__
inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
// __threadfence_block: fence at workgroup (block) scope.
__device__
inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
// __threadfence_system: fence with no scope restriction (system scope).
__device__
inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Workgroup barrier. When any fence flags are requested, the hardware barrier
// is bracketed by release/acquire workgroup fences so memory writes made
// before the barrier are visible after it.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}
// OpenCL-style wrapper taking the fence flags as an int.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n);
}
// __syncthreads: block-wide barrier with local-memory fence semantics.
// Must be reached by every thread in the block (convergent).
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}
// Barrier variants that additionally reduce a per-thread predicate across the
// block via OCKL workgroup reductions.
// NOTE(review): assumes the __ockl_wgred_* helpers synchronize the workgroup
// internally — confirm against the OCKL implementation.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
// hip.amdgcn.bc - device routine
/*
HW_ID Register bit structure for RDNA2 & RDNA3
WAVE_ID 4:0 Wave id within the SIMD.
SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
WGP_ID 13:10 Physical WGP ID.
SA_ID 16 Shader Array ID
SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
DP_RATE 31:29 Number of double-precision float units per SIMD
HW_ID Register bit structure for GCN and CDNA
WAVE_ID 3:0 Wave buffer slot number. 0-9.
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
CU_ID 11:8 Compute Unit the wave is assigned to.
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a
14:13 Shader Engine the wave is assigned to for gfx940-942
TG_ID 19:16 Thread-group ID
VM_ID 23:20 Virtual Memory ID
QUEUE_ID 26:24 Queue from which this wave was dispatched.
STATE_ID 29:27 State ID (graphics only, not compute).
ME_ID 31:30 Micro-engine ID.
XCC_ID Register bit structure for gfx940
XCC_ID 3:0 XCC the wave is assigned to.
*/
#if (defined (__GFX10__) || defined (__GFX11__))
#define HW_ID 23
#else
#define HW_ID 4
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
#define HW_ID_WGP_ID_SIZE 4
#define HW_ID_WGP_ID_OFFSET 10
#if (defined(__AMDGCN_CUMODE__))
#define HW_ID_CU_ID_SIZE 1
#define HW_ID_CU_ID_OFFSET 8
#endif
#else
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8
#endif
#if (defined(__gfx908__) || defined(__gfx90a__) || \
defined(__GFX11__))
#define HW_ID_SE_ID_SIZE 3
#else //4 SEs/XCC for gfx940-942
#define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
#define HW_ID_SE_ID_OFFSET 18
#define HW_ID_SA_ID_OFFSET 16
#define HW_ID_SA_ID_SIZE 1
#else
#define HW_ID_SE_ID_OFFSET 13
#endif
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
#define XCC_ID 20
#define XCC_ID_XCC_ID_SIZE 4
#define XCC_ID_XCC_ID_OFFSET 0
#endif
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
#define __HIP_NO_IMAGE_SUPPORT 1
#endif
/*
Encoding of parameter bitmask
HW_ID 5:0 HW_ID
OFFSET 10:6 Range: 0..31
SIZE 15:11 Range: 1..32
*/
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
/*
__smid returns the wave's assigned Compute Unit and Shader Engine.
The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
Note: the results vary over time.
SZ minus 1 since SIZE is 1-based.
*/
// Read the hardware ID registers and pack the wave's physical location
// (XCC/SE/SA/WGP/CU depending on target) into one integer. The value can
// change over a kernel's lifetime; see the HW_ID layout comment above.
__device__
inline
unsigned __smid(void)
{
    // GETREG_IMMED's SIZE field is 1-based, hence every "- 1" below.
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
#if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#else
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
#endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
    // gfx10/11: pack SE | SA | WGP (and CU in CU mode).
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
#if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
#endif
    return temp;
    //TODO : CU Mode impl
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // gfx940-942: pack XCC | SE | CU.
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
#else
    // GCN/CDNA default: SE in the high bits, CU in the low bits.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}
/**
* Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
* To be removed in a future release.
*/
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
#endif //defined(__clang__) && defined(__HIP__)
// loop unrolling
// Byte-wise device memcpy: copies in 4-byte unrolled chunks, then handles the
// 0-3 byte tail. Copies forward; source and destination must not overlap.
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    auto srcPtr = static_cast<const unsigned char*>(src);
    while (size >= 4u) {
        dstPtr[0] = srcPtr[0];
        dstPtr[1] = srcPtr[1];
        dstPtr[2] = srcPtr[2];
        dstPtr[3] = srcPtr[3];
        size -= 4u;
        srcPtr += 4u;
        dstPtr += 4u;
    }
    // Intentional fallthrough: each case copies its byte then falls into the
    // next lower case, covering the remaining size bytes.
    switch (size) {
        case 3:
            dstPtr[2] = srcPtr[2];
        case 2:
            dstPtr[1] = srcPtr[1];
        case 1:
            dstPtr[0] = srcPtr[0];
    }
    return dst;
}
// Byte-wise device memset: fills `size` bytes with `val` in 4-byte unrolled
// chunks, then handles the 0-3 byte tail.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto dstPtr = static_cast<unsigned char*>(dst);
    while (size >= 4u) {
        dstPtr[0] = val;
        dstPtr[1] = val;
        dstPtr[2] = val;
        dstPtr[3] = val;
        size -= 4u;
        dstPtr += 4u;
    }
    // Intentional fallthrough to fill the remaining 0-3 bytes.
    switch (size) {
        case 3:
            dstPtr[2] = val;
        case 2:
            dstPtr[1] = val;
        case 1:
            dstPtr[0] = val;
    }
    return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {