Optimize Min/Max paths with AVX10.2 intrinsics #112535

khushal1996 · 2025-02-13T20:07:10Z

Overview

This PR tracks optimizing the x64 floating-point min/max paths using the new min/max instructions (e.g. `vminmaxsd`) introduced in AVX10.2. We are following the spec doc to add the new instructions and optimize the x64/x86 min/max code paths.

Addresses #109081

Testing

Step 1: Run superpmi.exe on the library mch files with JITLateDisasm enabled to check whether any errors occur, and to verify that the LLVM disassembler decodes the emitted byte stream correctly.

For this step, a new coredistools built from the LLVM repo was used. After running superpmi with JITLateDisasm, no decoding failures were detected. Please contact the author to get access to the superpmi logs.

Step 2: Run superpmi and check for asmdiffs and assert errors.

Below is the summary of the superpmi run:

Top file improvements (bytes):
         -62 : 9311.dasm (-12.47% of base)

1 total files with Code Size differences (1 improved, 0 regressed), 0 unchanged.

Top method improvements (bytes):
         -62 (-12.47% of base) : 9311.dasm - System.Threading.ProcessorIdCache:ProcessorNumberSpeedCheck():ubyte (FullOpts)

Top method improvements (percentages):
         -62 (-12.47% of base) : 9311.dasm - System.Threading.ProcessorIdCache:ProcessorNumberSpeedCheck():ubyte (FullOpts)

1 total methods with Code Size differences (1 improved, 0 regressed).

diff
@@ -7,8 +7,8 @@
 ; partially interruptible
 ; Final local variable assignments
 ;
-;  V00 loc0         [V00,T18] (  4,  9.50)  double  ->  mm7        
-;  V01 loc1         [V01,T19] (  4,  9.50)  double  ->  mm6        
+;  V00 loc0         [V00,T13] (  4,  9.50)  double  ->  mm7        
+;  V01 loc1         [V01,T14] (  4,  9.50)  double  ->  mm6        
 ;  V02 loc2         [V02,T04] (  3, 64.25)    long  ->  rbx        
 ;* V03 loc3         [V03,T12] (  0,  0   )     int  ->  zero-ref   
 ;  V04 loc4         [V04,T00] ( 10,264   )    long  ->  rbp        
@@ -16,23 +16,18 @@
 ;* V06 loc6         [V06,T05] (  0,  0   )     int  ->  zero-ref   
 ;* V07 loc7         [V07,T06] (  0,  0   )     int  ->  zero-ref   
 ;  V08 OutArgs      [V08    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V09 tmp1         [V09,T13] (  3, 24   )  simd16  ->  mm0         "Cloning op2 for Math.Max/Min"
-;  V10 tmp2         [V10,T14] (  3, 24   )  simd16  ->  mm7         "Cloning op1 for Math.Max/Min"
-;  V11 tmp3         [V11,T15] (  3, 24   )  simd16  ->  mm0         "Cloning op2 for Math.Max/Min"
-;  V12 tmp4         [V12,T16] (  3, 24   )  simd16  ->  mm6         "Cloning op1 for Math.Max/Min"
-;  V13 tmp5         [V13,T11] (  2,  1   )     int  ->  rax         "Inline return value spill temp"
-;  V14 tmp6         [V14,T08] (  3,  3   )     int  ->  rax         "Inlining Arg"
-;  V15 cse0         [V15,T17] (  5, 16.25)  simd16  ->  mm8         hoist "CSE #02: aggressive"
-;  V16 cse1         [V16,T20] (  3,  3   )  double  ->  mm6         "CSE #01: aggressive"
-;  V17 rat0         [V17,T07] (  4, 12.25)     int  ->  rsi         "Trip count IV"
-;  V18 rat1         [V18,T02] (  4,196   )     int  ->  r14         "Trip count IV"
-;  V19 rat2         [V19,T03] (  4,196   )     int  ->  r14         "Trip count IV"
-;  V20 rat3         [V20,T09] (  3,  1.50)    long  ->  rbx         "fgMakeTemp is creating a new local variable"
-;  V21 rat4         [V21,T10] (  3,  1.50)    long  ->  rdx         "ReplaceWithLclVar is creating a new local variable"
-;  V22 rat5         [V22,T21] (  3,  3   )  double  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
-;  V23 rat6         [V23,T22] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V09 tmp1         [V09,T11] (  2,  1   )     int  ->  rax         "Inline return value spill temp"
+;  V10 tmp2         [V10,T08] (  3,  3   )     int  ->  rax         "Inlining Arg"
+;  V11 cse0         [V11,T15] (  3,  3   )  double  ->  mm6         "CSE #01: aggressive"
+;  V12 rat0         [V12,T07] (  4, 12.25)     int  ->  rsi         "Trip count IV"
+;  V13 rat1         [V13,T02] (  4,196   )     int  ->  r14         "Trip count IV"
+;  V14 rat2         [V14,T03] (  4,196   )     int  ->  r14         "Trip count IV"
+;  V15 rat3         [V15,T09] (  3,  1.50)    long  ->  rbx         "fgMakeTemp is creating a new local variable"
+;  V16 rat4         [V16,T10] (  3,  1.50)    long  ->  rdx         "ReplaceWithLclVar is creating a new local variable"
+;  V17 rat5         [V17,T16] (  3,  3   )  double  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
+;  V18 rat6         [V18,T17] (  3,  3   )  simd16  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
-; Lcl frame size = 80
+; Lcl frame size = 64
 
 G_M1452_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
        push     r14
@@ -40,11 +35,10 @@ G_M1452_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 80
-       vmovaps  xmmword ptr [rsp+0x40], xmm6
-       vmovaps  xmmword ptr [rsp+0x30], xmm7
-       vmovaps  xmmword ptr [rsp+0x20], xmm8
-						;; size=28 bbWeight=1 PerfScore 11.25
+       sub      rsp, 64
+       vmovaps  xmmword ptr [rsp+0x30], xmm6
+       vmovaps  xmmword ptr [rsp+0x20], xmm7
+						;; size=22 bbWeight=1 PerfScore 9.25
 G_M1452_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vmovsd   xmm6, qword ptr [reloc @RWD00]
        vmovaps  xmm7, xmm6
@@ -68,9 +62,8 @@ G_M1452_IG04:        ; bbWeight=0.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, byre
        shr      rax, 63
        sar      rdx, 18
        lea      rbx, [rdx+rax+0x01]
-       vmovups  xmm8, xmmword ptr [reloc @RWD16]
        mov      esi, 10
-						;; size=45 bbWeight=0.25 PerfScore 3.00
+						;; size=37 bbWeight=0.25 PerfScore 2.25
 G_M1452_IG05:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        mov      edi, 8
 						;; size=5 bbWeight=4 PerfScore 1.00
@@ -105,17 +98,14 @@ G_M1452_IG10:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vxorps   xmm1, xmm1, xmm1
        vcvtsi2sd xmm1, xmm1, edi
        vdivsd   xmm0, xmm0, xmm1
-       vrangesd xmm1, xmm7, xmm0, 4
-       vfixupimmsd xmm7, xmm0, xmm8, 0
-       vfixupimmsd xmm1, xmm7, xmm8, 0
-       vmovaps  xmm7, xmm1
+       vminmaxsd xmm7, xmm7, xmm0, 4
        mov      eax, edi
        sar      eax, 31
        and      eax, 3
        add      eax, edi
        mov      edi, eax
        sar      edi, 2
-						;; size=61 bbWeight=4 PerfScore 143.67
+						;; size=43 bbWeight=4 PerfScore 118.67
 G_M1452_IG11:        ; bbWeight=32, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
        add      edi, edi
        call     System.Diagnostics.Stopwatch:QueryPerformanceCounter():long
@@ -147,21 +137,18 @@ G_M1452_IG15:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        vxorps   xmm1, xmm1, xmm1
        vcvtsi2sd xmm1, xmm1, edi
        vdivsd   xmm0, xmm0, xmm1
-       vrangesd xmm1, xmm6, xmm0, 4
-       vfixupimmsd xmm6, xmm0, xmm8, 0
-       vfixupimmsd xmm1, xmm6, xmm8, 0
-       vmovaps  xmm6, xmm1
+       vminmaxsd xmm6, xmm6, xmm0, 4
        dec      esi
        jne      G_M1452_IG05
-						;; size=54 bbWeight=4 PerfScore 140.67
+						;; size=36 bbWeight=4 PerfScore 115.67
 G_M1452_IG16:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
-       vmulsd   xmm0, xmm7, qword ptr [reloc @RWD32]
+       vmulsd   xmm0, xmm7, qword ptr [reloc @RWD08]
        vdivsd   xmm0, xmm0, xmm6
-       vfixupimmsd xmm0, xmm0, qword ptr [reloc @RWD48], 0
-       vcmppd   k1, xmm0, xmmword ptr [reloc @RWD64], 13
+       vfixupimmsd xmm0, xmm0, qword ptr [reloc @RWD16], 0
+       vcmppd   k1, xmm0, xmmword ptr [reloc @RWD32], 13
        vcvttsd2si eax, xmm0
        vpbroadcastd xmm0, eax
-       vpblendmd xmm0 {k1}, xmm0, dword ptr [reloc @RWD80] {1to4}
+       vpblendmd xmm0 {k1}, xmm0, dword ptr [reloc @RWD48] {1to4}
        vmovd    eax, xmm0
        mov      ecx, 0x1388
        cmp      eax, 0x1388
@@ -172,49 +159,44 @@ G_M1452_IG16:        ; bbWeight=0.50, gcrefRegs=0000 {}, byrefRegs=0000 {}, byre
        movzx    rax, al
 						;; size=90 bbWeight=0.50 PerfScore 21.00
 G_M1452_IG17:        ; bbWeight=0.50, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x40]
-       vmovaps  xmm7, xmmword ptr [rsp+0x30]
-       vmovaps  xmm8, xmmword ptr [rsp+0x20]
-       add      rsp, 80
+       vmovaps  xmm6, xmmword ptr [rsp+0x30]
+       vmovaps  xmm7, xmmword ptr [rsp+0x20]
+       add      rsp, 64
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
        ret      
-						;; size=29 bbWeight=0.50 PerfScore 7.88
+						;; size=23 bbWeight=0.50 PerfScore 5.88
 G_M1452_IG18:        ; bbWeight=0.50, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        mov      dword ptr [(reloc)], 0xFFFF      ; static handle
        xor      eax, eax
 						;; size=12 bbWeight=0.50 PerfScore 0.62
 G_M1452_IG19:        ; bbWeight=0.50, epilog, nogc, extend
-       vmovaps  xmm6, xmmword ptr [rsp+0x40]
-       vmovaps  xmm7, xmmword ptr [rsp+0x30]
-       vmovaps  xmm8, xmmword ptr [rsp+0x20]
-       add      rsp, 80
+       vmovaps  xmm6, xmmword ptr [rsp+0x30]
+       vmovaps  xmm7, xmmword ptr [rsp+0x20]
+       add      rsp, 64
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
        ret      
-						;; size=29 bbWeight=0.50 PerfScore 7.88
+						;; size=23 bbWeight=0.50 PerfScore 5.88
 G_M1452_IG20:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
        call     CORINFO_HELP_READYTORUN_NONGCSTATIC_BASE
        ; gcr arg pop 0
        jmp      G_M1452_IG04
 						;; size=10 bbWeight=0 PerfScore 0.00
 RWD00  	dq	7FEFFFFFFFFFFFFFh	; 1.79769313e+308
-RWD08  	dd	00000000h, 00000000h
-RWD16  	dq	0000000000000001h, 0000000000000000h
-RWD32  	dq	4014000000000000h	;            5
-RWD40  	dd	00000000h, 00000000h
-RWD48  	dq	0000000000000088h, 0000000000000000h
-RWD64  	dq	41DFFFFFFFC00000h, 41DFFFFFFFC00000h
-RWD80  	dd	7FFFFFFFh
+RWD08  	dq	4014000000000000h	;            5
+RWD16  	dq	0000000000000088h, 0000000000000000h
+RWD32  	dq	41DFFFFFFFC00000h, 41DFFFFFFFC00000h
+RWD48  	dd	7FFFFFFFh

Since these diffs are expected, we can conclude that the superpmi run was successful.

Step 3: Run the JIT test suite using a stable subset of tests on SDE

Results

Optimized ASM

Note: Below is a case-by-case comparison of the asm generated for Avx512 vs. Avx10.2. The Avx10.2 asm was collected under SDE.