-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove AO from a couple of SpanHelpers methods #85819
Conversation
Tagging subscribers to this area: @dotnet/area-system-memory
Issue Details
Contributes to #84421 (removes a couple of methods jitted during Hello World start).
I don't see a good reason for these to have [AO], R2R'd versions look good enough to me (SSE based):
R2R codegen:
; Assembly listing for method System.SpanHelpers:IndexOfNullCharacter(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
G_M000_IG01: ;; offset=0000H
sub rsp, 40
vzeroupper
G_M000_IG02: ;; offset=0007H
xor eax, eax
mov edx, 0x7FFFFFFF
test cl, 1
jne SHORT G_M000_IG04
G_M000_IG03: ;; offset=0013H
mov edx, ecx
neg edx
mov r8d, edx
shr r8d, 31
add edx, r8d
sar edx, 1
and rdx, 7
G_M000_IG04: ;; offset=0027H
cmp rdx, 4
jl SHORT G_M000_IG06
G_M000_IG05: ;; offset=002DH
cmp word ptr [rcx+2*rax], 0
je G_M000_IG21
cmp word ptr [rcx+2*rax+02H], 0
je G_M000_IG20
cmp word ptr [rcx+2*rax+04H], 0
je G_M000_IG19
cmp word ptr [rcx+2*rax+06H], 0
je G_M000_IG18
add rax, 4
add rdx, -4
cmp rdx, 4
jge SHORT G_M000_IG05
G_M000_IG06: ;; offset=006AH
test rdx, rdx
jle SHORT G_M000_IG08
G_M000_IG07: ;; offset=006FH
cmp word ptr [rcx+2*rax], 0
je G_M000_IG21
inc rax
dec rdx
test rdx, rdx
jg SHORT G_M000_IG07
G_M000_IG08: ;; offset=0085H
cmp rax, 0x7FFFFFFF
jge G_M000_IG22
lea rdx, [rcx+2*rax]
test dl, 31
je SHORT G_M000_IG11
G_M000_IG09: ;; offset=009AH
vxorps xmm0, xmm0, xmm0
vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
vpmovmskb edx, xmm0
test edx, edx
jne SHORT G_M000_IG10
add rax, 8
jmp SHORT G_M000_IG11
G_M000_IG10: ;; offset=00B1H
xor ecx, ecx
tzcnt ecx, edx
shr ecx, 1
mov edx, ecx
add eax, edx
jmp G_M000_IG21
G_M000_IG11: ;; offset=00C2H
mov rdx, rax
neg rdx
add rdx, 0x7FFFFFFF
and rdx, -16
jle SHORT G_M000_IG13
G_M000_IG12: ;; offset=00D5H
vxorps ymm0, ymm0, ymm0
vpcmpeqw ymm0, ymm0, ymmword ptr [rcx+2*rax]
vpmovmskb r8d, ymm0
test r8d, r8d
jne SHORT G_M000_IG15
add rax, 16
add rdx, -16
test rdx, rdx
jg SHORT G_M000_IG12
G_M000_IG13: ;; offset=00F4H
mov r8, rax
neg r8
add r8, 0x7FFFFFFF
and r8, -8
jle SHORT G_M000_IG17
G_M000_IG14: ;; offset=0107H
vxorps xmm0, xmm0, xmm0
vpcmpeqw xmm0, xmm0, xmmword ptr [rcx+2*rax]
vpmovmskb edx, xmm0
test edx, edx
jne SHORT G_M000_IG16
add rax, 8
jmp SHORT G_M000_IG17
G_M000_IG15: ;; offset=011EH
xor edx, edx
tzcnt edx, r8d
shr edx, 1
mov ecx, edx
add eax, ecx
jmp SHORT G_M000_IG21
G_M000_IG16: ;; offset=012DH
tzcnt edx, edx
shr edx, 1
add eax, edx
jmp SHORT G_M000_IG21
G_M000_IG17: ;; offset=0137H
cmp rax, 0x7FFFFFFF
jge SHORT G_M000_IG22
mov rdx, rax
neg rdx
add rdx, 0x7FFFFFFF
jmp G_M000_IG04
G_M000_IG18: ;; offset=0151H
add eax, 3
jmp SHORT G_M000_IG21
G_M000_IG19: ;; offset=0156H
add eax, 2
jmp SHORT G_M000_IG21
G_M000_IG20: ;; offset=015BH
inc eax
G_M000_IG21: ;; offset=015DH
vzeroupper
add rsp, 40
ret
G_M000_IG22: ;; offset=0165H
call [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
int3
; Total bytes of code 364
; Assembly listing for method System.SpanHelpers:IndexOfNullByte(ulong):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
G_M000_IG01: ;; offset=0000H
sub rsp, 40
vzeroupper
G_M000_IG02: ;; offset=0007H
xor eax, eax
mov edx, ecx
and edx, 15
neg edx
add edx, 16
and edx, 15
G_M000_IG03: ;; offset=0016H
cmp rdx, 8
jb SHORT G_M000_IG05
G_M000_IG04: ;; offset=001CH
add rdx, -8
cmp byte ptr [rcx+rax], 0
je G_M000_IG19
cmp byte ptr [rcx+rax+01H], 0
je G_M000_IG20
cmp byte ptr [rcx+rax+02H], 0
je G_M000_IG21
cmp byte ptr [rcx+rax+03H], 0
je G_M000_IG22
cmp byte ptr [rcx+rax+04H], 0
je G_M000_IG23
cmp byte ptr [rcx+rax+05H], 0
je G_M000_IG24
cmp byte ptr [rcx+rax+06H], 0
je G_M000_IG25
cmp byte ptr [rcx+rax+07H], 0
je G_M000_IG26
add rax, 8
cmp rdx, 8
jae SHORT G_M000_IG04
G_M000_IG05: ;; offset=0081H
cmp rdx, 4
jb SHORT G_M000_IG07
G_M000_IG06: ;; offset=0087H
add rdx, -4
cmp byte ptr [rcx+rax], 0
je G_M000_IG19
cmp byte ptr [rcx+rax+01H], 0
je G_M000_IG20
cmp byte ptr [rcx+rax+02H], 0
je G_M000_IG21
cmp byte ptr [rcx+rax+03H], 0
je G_M000_IG22
add rax, 4
G_M000_IG07: ;; offset=00BAH
test rdx, rdx
je SHORT G_M000_IG09
G_M000_IG08: ;; offset=00BFH
dec rdx
cmp byte ptr [rcx+rax], 0
je G_M000_IG19
inc rax
test rdx, rdx
jne SHORT G_M000_IG08
G_M000_IG09: ;; offset=00D4H
cmp rax, 0x7FFFFFFF
jae G_M000_IG28
mov edx, ecx
add rdx, rax
test dl, 31
je SHORT G_M000_IG11
G_M000_IG10: ;; offset=00EAH
vxorps xmm0, xmm0, xmm0
vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
vpmovmskb edx, xmm0
test edx, edx
jne SHORT G_M000_IG16
add rax, 16
G_M000_IG11: ;; offset=00FFH
mov edx, eax
neg edx
add edx, 0x7FFFFFFF
and edx, -32
cmp rdx, rax
jbe SHORT G_M000_IG13
G_M000_IG12: ;; offset=0111H
vxorps ymm0, ymm0, ymm0
vpcmpeqb ymm0, ymm0, ymmword ptr [rcx+rax]
vpmovmskb r8d, ymm0
test r8d, r8d
jne SHORT G_M000_IG17
add rax, 32
cmp rdx, rax
ja SHORT G_M000_IG12
G_M000_IG13: ;; offset=012CH
mov edx, eax
neg edx
add edx, 0x7FFFFFFF
and edx, -16
mov r8d, edx
cmp r8, rax
jbe SHORT G_M000_IG15
G_M000_IG14: ;; offset=0141H
vxorps xmm0, xmm0, xmm0
vpcmpeqb xmm0, xmm0, xmmword ptr [rcx+rax]
vpmovmskb edx, xmm0
test edx, edx
jne SHORT G_M000_IG18
add rax, 16
G_M000_IG15: ;; offset=0156H
cmp rax, 0x7FFFFFFF
jae SHORT G_M000_IG28
mov rdx, rax
neg rdx
add rdx, 0x7FFFFFFF
jmp G_M000_IG03
G_M000_IG16: ;; offset=0170H
tzcnt edx, edx
add eax, edx
jmp SHORT G_M000_IG27
G_M000_IG17: ;; offset=0178H
xor edx, edx
tzcnt edx, r8d
add eax, edx
jmp SHORT G_M000_IG27
G_M000_IG18: ;; offset=0183H
tzcnt edx, edx
add eax, edx
jmp SHORT G_M000_IG27
G_M000_IG19: ;; offset=018BH
jmp SHORT G_M000_IG27
G_M000_IG20: ;; offset=018DH
inc eax
jmp SHORT G_M000_IG27
G_M000_IG21: ;; offset=0191H
add eax, 2
jmp SHORT G_M000_IG27
G_M000_IG22: ;; offset=0196H
add eax, 3
jmp SHORT G_M000_IG27
G_M000_IG23: ;; offset=019BH
add eax, 4
jmp SHORT G_M000_IG27
G_M000_IG24: ;; offset=01A0H
add eax, 5
jmp SHORT G_M000_IG27
G_M000_IG25: ;; offset=01A5H
add eax, 6
jmp SHORT G_M000_IG27
G_M000_IG26: ;; offset=01AAH
add eax, 7
G_M000_IG27: ;; offset=01ADH
vzeroupper
add rsp, 40
ret
G_M000_IG28: ;; offset=01B5H
call [System.SpanHelpers:ThrowMustBeNullTerminatedString()]
int3
; Total bytes of code 444
|
Also, removed from SequenceCompareTo:
; Assembly listing for method System.SpanHelpers:SequenceCompareTo(byref,int,byref,int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 7 single block inlinees; 0 inlinees without PGO data
G_M000_IG01: ;; offset=0000H
push rsi
vzeroupper
G_M000_IG02: ;; offset=0004H
cmp rcx, r8
je G_M000_IG16
G_M000_IG03: ;; offset=000DH
cmp edx, r9d
mov eax, r9d
cmovb eax, edx
mov r10d, eax
xor r11d, r11d
mov rax, r10
cmp rax, 32
jb SHORT G_M000_IG08
add rax, -32
je SHORT G_M000_IG05
G_M000_IG04: ;; offset=002BH
vmovups ymm0, ymmword ptr [rcx+r11]
vpcmpeqb ymm0, ymm0, ymmword ptr [r8+r11]
vpmovmskb r10d, ymm0
cmp r10d, -1
jne SHORT G_M000_IG06
add r11, 32
cmp rax, r11
ja SHORT G_M000_IG04
G_M000_IG05: ;; offset=004AH
mov r11, rax
vmovups ymm0, ymmword ptr [rcx+r11]
vpcmpeqb ymm0, ymm0, ymmword ptr [r8+r11]
vpmovmskb r10d, ymm0
cmp r10d, -1
je G_M000_IG16
G_M000_IG06: ;; offset=0067H
mov eax, r10d
not eax
tzcnt eax, eax
add rax, r11
mov r11, rax
movzx rax, byte ptr [rcx+r11]
movzx rcx, byte ptr [r8+r11]
sub eax, ecx
G_M000_IG07: ;; offset=0082H
vzeroupper
pop rsi
ret
G_M000_IG08: ;; offset=0087H
cmp r10, 16
jb SHORT G_M000_IG12
add rax, -16
je SHORT G_M000_IG09
vmovups xmm0, xmmword ptr [rcx]
vpcmpeqb xmm0, xmm0, xmmword ptr [r8]
vpmovmskb r10d, xmm0
cmp r10d, 0xFFFF
jne SHORT G_M000_IG10
G_M000_IG09: ;; offset=00A9H
mov r11, rax
vmovups xmm0, xmmword ptr [rcx+r11]
vpcmpeqb xmm0, xmm0, xmmword ptr [r8+r11]
vpmovmskb r10d, xmm0
cmp r10d, 0xFFFF
je SHORT G_M000_IG16
G_M000_IG10: ;; offset=00C5H
mov eax, r10d
not eax
tzcnt eax, eax
add rax, r11
mov r11, rax
movzx rax, byte ptr [rcx+r11]
movzx rcx, byte ptr [r8+r11]
sub eax, ecx
G_M000_IG11: ;; offset=00E0H
vzeroupper
pop rsi
ret
G_M000_IG12: ;; offset=00E5H
cmp r10, 8
jbe SHORT G_M000_IG14
lea rax, [r10-08H]
test rax, rax
je SHORT G_M000_IG14
G_M000_IG13: ;; offset=00F4H
mov rsi, qword ptr [rcx+r11]
cmp rsi, qword ptr [r8+r11]
jne SHORT G_M000_IG14
add r11, 8
cmp rax, r11
ja SHORT G_M000_IG13
G_M000_IG14: ;; offset=0107H
cmp r10, r11
jbe SHORT G_M000_IG16
G_M000_IG15: ;; offset=010CH
movzx rax, byte ptr [rcx+r11]
movzx rsi, byte ptr [r8+r11]
sub eax, esi
jne SHORT G_M000_IG18
inc r11
cmp r10, r11
ja SHORT G_M000_IG15
G_M000_IG16: ;; offset=0122H
mov eax, edx
sub eax, r9d
G_M000_IG17: ;; offset=0127H
vzeroupper
pop rsi
ret
G_M000_IG18: ;; offset=012CH
vzeroupper
pop rsi
ret
; Total bytes of code 305 |
a2dea04
to
4c583a2
Compare
Also contributes to #71261 |
@@ -422,7 +422,6 @@ public static unsafe int SequenceCompareTo(ref char first, int firstLength, ref | |||
|
|||
// IndexOfNullCharacter processes memory in aligned chunks, and thus it won't crash even if it accesses memory beyond the null terminator. | |||
// This behavior is an implementation detail of the runtime and callers outside System.Private.CoreLib must not depend on it. | |||
[MethodImpl(MethodImplOptions.AggressiveOptimization)] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are some more SpanHelpers methods marked with AggressiveOptimization. Delete it on all of them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are some more SpanHelpers methods marked with AggressiveOptimization. Delete it on all of them?
There are a few cases where, because of SVM, we get non-optimal codegen in R2R (similar to #84421 (comment)), so I didn't want to regress SpanHelpers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
E.g. System.SpanHelpers:LastIndexOfValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,int)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So only *IndexOfAnyValue* (with generic math) are left with AggressiveOptimization in SpanHelpers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both R2R and Tier0 codegen for these complicated generic constructs tend to be pretty bad. I would not worry about it - we have the same problem in a number of other places.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
E.g. if I remove [AO] from LastIndexOfValueType, here is what I get:
; Assembly listing for method System.SpanHelpers:LastIndexOfValueType[short](byref,short,int):int
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; ReadyToRun compilation
; optimized code
; rsp based frame
; fully interruptible
; No PGO data
G_M000_IG01: ;; offset=0000H
G_M000_IG02: ;; offset=0000H
movsx rdx, dx
lea rax, [(reloc 0x435488)]
G_M000_IG03: ;; offset=000BH
tail.jmp [rax]System.SpanHelpers:LastIndexOfValueType[short,System.SpanHelpers+DontNegate`1[short]](byref,short,int):int
; Total bytes of code 14
and that nested LastIndexOfValueType is JIT-compiled, so presumably we'll get a slow Tier0 version instead of having the AggressiveOptimization one at startup - if that is ok, I can remove it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is ok.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, removed them. Thus, we only have 3 uses of AO in corelib: one in AsyncTaskMethodBuilder, which says that AO helps it avoid allocations in T0, and two in CastHelpers, which have to be there since the VM special-cases them to be direct calls.
@EgorBo - do you know why |
I think the comparison was apples vs oranges... they were running different code because EventSource startup gunk was being invoked in one case and not the other. |
I'd say it is #85791 (comment) |
Got it. Yes @TIHan also confirmed that in #85791 (comment). |
Contributes to #85791 (removes a couple of methods jitted during Hello World start).
I don't see a good reason for these to have [AO], R2R'd versions look good enough to me (SSE based):
R2R codegen: