-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Perf -24%] System.Memory.Span<Int32>.SequenceCompareTo #2286
Comments
I spent some time investigating this one and came to the same conclusion as in #2319. On my local machine, the benchmark appears faster with the 32B alignment changes.
With 32B alignment:
G_M65109_IG01: ;; offset=0000H
00007ff8`a2107a40 4156 push r14
00007ff8`a2107a42 57 push rdi
00007ff8`a2107a43 56 push rsi
00007ff8`a2107a44 55 push rbp
00007ff8`a2107a45 53 push rbx
00007ff8`a2107a46 4883EC30 sub rsp, 48
00007ff8`a2107a4a 488BF1 mov rsi, rcx
00007ff8`a2107a4d 8BEA mov ebp, edx
00007ff8`a2107a4f 498BF8 mov rdi, r8
00007ff8`a2107a52 418BD9 mov ebx, r9d
;; bbWeight=1 PerfScore 6.25
G_M65109_IG02: ;; offset=0015H
00007ff8`a2107a55 8BCD mov ecx, ebp
00007ff8`a2107a57 F7D1 not ecx
00007ff8`a2107a59 C1E91F shr ecx, 31
00007ff8`a2107a5c 48BA6030B2E477020000 mov rdx, 0x277E4B23060
; =========================== 32B boundary ===========================
; ....
; ....
00007ff8`a2107a9e 85C0 test eax, eax
; =========================== 32B boundary ===========================
00007ff8`a2107aa0 7E33 jle SHORT G_M65109_IG16
;; bbWeight=1 PerfScore 1.50
G_M65109_IG09: ;; offset=0062H
00007ff8`a2107aa2 4863CA movsxd rcx, edx
00007ff8`a2107aa5 448B048F mov r8d, dword ptr [rdi+4*rcx]
00007ff8`a2107aa9 488D0C8E lea rcx, bword ptr [rsi+4*rcx]
00007ff8`a2107aad 443901 cmp dword ptr [rcx], r8d
00007ff8`a2107ab0 7D08 jge SHORT G_M65109_IG11
;; bbWeight=4 PerfScore 23.00
G_M65109_IG10: ;; offset=0072H
00007ff8`a2107ab2 41B9FFFFFFFF mov r9d, -1
00007ff8`a2107ab8 EB10 jmp SHORT G_M65109_IG14
;; bbWeight=2 PerfScore 4.50
G_M65109_IG11: ;; offset=007AH
00007ff8`a2107aba 443901 cmp dword ptr [rcx], r8d ; <--- hot instruction
00007ff8`a2107abd 7E08 jle SHORT G_M65109_IG13
;; bbWeight=1 PerfScore 3.00
G_M65109_IG12: ;; offset=007FH
00007ff8`a2107abf 41B901000000 mov r9d, 1
; =========================== 32B boundary ===========================
00007ff8`a2107ac5 EB03 jmp SHORT G_M65109_IG14
;; bbWeight=2 PerfScore 4.50
G_M65109_IG13: ;; offset=0087H
00007ff8`a2107ac7 4533C9 xor r9d, r9d ; <--- hot instruction
;; bbWeight=2 PerfScore 0.50
G_M65109_IG14: ;; offset=008AH
00007ff8`a2107aca 4585C9 test r9d, r9d
00007ff8`a2107acd 7511 jne SHORT G_M65109_IG17
;; bbWeight=4 PerfScore 5.00
G_M65109_IG15: ;; offset=008FH
00007ff8`a2107acf FFC2 inc edx
00007ff8`a2107ad1 3BD0 cmp edx, eax
00007ff8`a2107ad3 7CCD jl SHORT G_M65109_IG09
;; bbWeight=4 PerfScore 6.00
G_M65109_IG16: ;; offset=0095H
00007ff8`a2107ad5 3BEB cmp ebp, ebx
00007ff8`a2107ad7 7D15 jge SHORT G_M65109_IG19
00007ff8`a2107ad9 B8FFFFFFFF mov eax, -1
00007ff8`a2107ade EB1B jmp SHORT G_M65109_IG22
; =========================== 32B boundary ===========================
Without 32B alignment:
G_M65109_IG01: ;; offset=0000H
00007ff8`a20d7990 4156 push r14
00007ff8`a20d7992 57 push rdi
00007ff8`a20d7993 56 push rsi
00007ff8`a20d7994 55 push rbp
00007ff8`a20d7995 53 push rbx
00007ff8`a20d7996 4883EC30 sub rsp, 48
00007ff8`a20d799a 488BF1 mov rsi, rcx
00007ff8`a20d799d 8BEA mov ebp, edx
00007ff8`a20d799f 498BF8 mov rdi, r8
; =========================== 32B boundary ===========================
00007ff8`a20d79a2 418BD9 mov ebx, r9d
; ....
; ....
;; bbWeight=1 PerfScore 1.50
G_M65109_IG09: ;; offset=0062H
00007ff8`a20d79f2 4863CA movsxd rcx, edx
00007ff8`a20d79f5 448B048F mov r8d, dword ptr [rdi+4*rcx]
00007ff8`a20d79f9 488D0C8E lea rcx, bword ptr [rsi+4*rcx]
00007ff8`a20d79fd 443901 cmp dword ptr [rcx], r8d
; =========================== 32B boundary ===========================
00007ff8`a20d7a00 7D08 jge SHORT G_M65109_IG11
;; bbWeight=4 PerfScore 23.00
G_M65109_IG10: ;; offset=0072H
00007ff8`a20d7a02 41B9FFFFFFFF mov r9d, -1
00007ff8`a20d7a08 EB10 jmp SHORT G_M65109_IG14
;; bbWeight=2 PerfScore 4.50
G_M65109_IG11: ;; offset=007AH
00007ff8`a20d7a0a 443901 cmp dword ptr [rcx], r8d ; <--- hot instruction
00007ff8`a20d7a0d 7E08 jle SHORT G_M65109_IG13
;; bbWeight=1 PerfScore 3.00
G_M65109_IG12: ;; offset=007FH
00007ff8`a20d7a0f 41B901000000 mov r9d, 1
00007ff8`a20d7a15 EB03 jmp SHORT G_M65109_IG14
;; bbWeight=2 PerfScore 4.50
G_M65109_IG13: ;; offset=0087H
00007ff8`a20d7a17 4533C9 xor r9d, r9d ; <--- hot instruction
;; bbWeight=2 PerfScore 0.50
G_M65109_IG14: ;; offset=008AH
00007ff8`a20d7a1a 4585C9 test r9d, r9d
00007ff8`a20d7a1d 7511 jne SHORT G_M65109_IG17
;; bbWeight=4 PerfScore 5.00
G_M65109_IG15: ;; offset=008FH
00007ff8`a20d7a1f FFC2 inc edx
; =========================== 32B boundary ===========================
Looking at the VTune profiler, it looks like the 2 hot instructions
Cc: @adamsitnik, @AndyAyersMS
Run Information
Regressions in System.Memory.Span
Historical Data in Reporting System
Repro
Histogram
System.Memory.Span.SequenceCompareTo(Size: 512)
Docs
Profiling workflow for dotnet/runtime repository
Benchmarking workflow for dotnet/runtime repository
The text was updated successfully, but these errors were encountered: