-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Conversation
Before ; Total bytes of code 21, prolog size 5 for method Dictionary`2:TryAdd(char,ref):bool:this ; Lcl frame size = 40
G_M28467_IG01:
57 push rdi
56 push rsi
4883EC28 sub rsp, 40
488BF1 mov rsi, rcx
498BF8 mov rdi, r8
G_M28467_IG02:
0FB7D2 movzx rdx, dx
488BCE mov rcx, rsi
E800000000 call Dictionary`2:FindEntry(char):int:this
85C0 test eax, eax
7C29 jl SHORT G_M28467_IG04
488B5610 mov rdx, gword ptr [rsi+16]
3B4208 cmp eax, dword ptr [rdx+8]
732C jae SHORT G_M28467_IG06
4863C8 movsxd rcx, eax
488D0C49 lea rcx, [rcx+2*rcx]
488B54CA10 mov rdx, gword ptr [rdx+8*rcx+16]
488BCF mov rcx, rdi
E800000000 call CORINFO_HELP_CHECKED_ASSIGN_REF
B801000000 mov eax, 1
G_M28467_IG03:
4883C428 add rsp, 40
5E pop rsi
5F pop rdi
C3 ret
G_M28467_IG04:
33C0 xor rax, rax
488907 mov qword ptr [rdi], rax
G_M28467_IG05:
4883C428 add rsp, 40
5E pop rsi
5F pop rdi
C3 ret
G_M28467_IG06:
E800000000 call CORINFO_HELP_RNGCHKFAIL
CC int3 After ; Assembly listing for method Dictionary`2:TryGetValue(char,byref):bool:this ; Lcl frame size = 48
G_M28467_IG01:
56 push rsi
4883EC30 sub rsp, 48
498BF0 mov rsi, r8
G_M28467_IG02:
0FB7D2 movzx rdx, dx
4C8D442428 lea r8, bword ptr [rsp+28H]
E800000000 call Dictionary`2:FindEntry(char,byref):byref:this
488B10 mov rdx, gword ptr [rax]
488BCE mov rcx, rsi
E800000000 call CORINFO_HELP_CHECKED_ASSIGN_REF
8B442428 mov eax, dword ptr [rsp+28H]
0FB6C0 movzx rax, al
G_M28467_IG03:
4883C430 add rsp, 48
5E pop rsi
C3 ret |
After is longer as it switches on the nullness of comparer to use Before ; Assembly listing for method Dictionary`2:FindEntry(char):int:this ; Lcl frame size = 48
G_M38464_IG01:
4156 push r14
57 push rdi
56 push rsi
55 push rbp
53 push rbx
4883EC30 sub rsp, 48
488BF1 mov rsi, rcx
G_M38464_IG02:
48837E0800 cmp gword ptr [rsi+8], 0
0F848C000000 je G_M38464_IG05
488B4E18 mov rcx, gword ptr [rsi+24]
0FB7FA movzx rdi, dx
8BD7 mov edx, edi
4C8D1D00000000 lea r11, [(reloc)]
3909 cmp dword ptr [rcx], ecx
41FF13 call qword ptr [r11]IEqualityComparer`1:GetHashCode(char):int:this
8BD8 mov ebx, eax
81E3FFFFFF7F and ebx, 0x7FFFFFFF
488B4608 mov rax, gword ptr [rsi+8]
488BC8 mov rcx, rax
448B4008 mov r8d, dword ptr [rax+8]
8BC3 mov eax, ebx
99 cdq
41F7F8 idiv edx:eax, r8d
413BD0 cmp edx, r8d
7376 jae SHORT G_M38464_IG09
4863D2 movsxd rdx, edx
8B6C9110 mov ebp, dword ptr [rcx+4*rdx+16]
85ED test ebp, ebp
7C4E jl SHORT G_M38464_IG05
G_M38464_IG03:
488B4E10 mov rcx, gword ptr [rsi+16]
3B6908 cmp ebp, dword ptr [rcx+8]
7362 jae SHORT G_M38464_IG09
4863D5 movsxd rdx, ebp
4C8D3452 lea r14, [rdx+2*rdx]
4A8D4CF110 lea rcx, bword ptr [rcx+8*r14+16]
395908 cmp dword ptr [rcx+8], ebx
7522 jne SHORT G_M38464_IG04
488B5618 mov rdx, gword ptr [rsi+24]
440FB74110 movzx r8, word ptr [rcx+16]
488BCA mov rcx, rdx
418BD0 mov edx, r8d
448BC7 mov r8d, edi
4C8D1D00000000 lea r11, [(reloc)]
3909 cmp dword ptr [rcx], ecx
41FF13 call qword ptr [r11]IEqualityComparer`1:Equals(char,char):bool:this
85C0 test eax, eax
7522 jne SHORT G_M38464_IG07
G_M38464_IG04:
488B4610 mov rax, gword ptr [rsi+16]
3B6808 cmp ebp, dword ptr [rax+8]
7326 jae SHORT G_M38464_IG09
428B6CF01C mov ebp, dword ptr [rax+8*r14+28]
85ED test ebp, ebp
7DB2 jge SHORT G_M38464_IG03
G_M38464_IG05:
B8FFFFFFFF mov eax, -1
G_M38464_IG06:
4883C430 add rsp, 48
5B pop rbx
5D pop rbp
5E pop rsi
5F pop rdi
415E pop r14
C3 ret
G_M38464_IG07:
8BC5 mov eax, ebp
G_M38464_IG08:
4883C430 add rsp, 48
5B pop rbx
5D pop rbp
5E pop rsi
5F pop rdi
415E pop r14
C3 ret
G_M38464_IG09:
E800000000 call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 199, prolog size 13 for method Dictionary`2:FindEntry(char):int:this After ; Assembly listing for method Dictionary`2:FindEntry(char,byref):byref:this ; Lcl frame size = 40
G_M38470_IG01:
4157 push r15
4156 push r14
57 push rdi
56 push rsi
55 push rbp
53 push rbx
4883EC28 sub rsp, 40
488BF1 mov rsi, rcx
498BF8 mov rdi, r8
G_M38470_IG02:
C60701 mov byte ptr [rdi], 1
48837E0800 cmp gword ptr [rsi+8], 0
0F84B0000000 je G_M38470_IG08
488B4E18 mov rcx, gword ptr [rsi+24]
4885C9 test rcx, rcx
7413 je SHORT G_M38470_IG03
0FB7DA movzx rbx, dx
8BD3 mov edx, ebx
4C8D1D00000000 lea r11, [(reloc)]
3909 cmp dword ptr [rcx], ecx
41FF13 call qword ptr [r11]IEqualityComparer`1:GetHashCode(char):int:this
EB0C jmp SHORT G_M38470_IG04
G_M38470_IG03:
0FB7DA movzx rbx, dx
8BC3 mov eax, ebx
8BD0 mov edx, eax
C1E210 shl edx, 16
0BC2 or eax, edx
G_M38470_IG04:
8BE8 mov ebp, eax
81E5FFFFFF7F and ebp, 0x7FFFFFFF
488B4E08 mov rcx, gword ptr [rsi+8]
448B4108 mov r8d, dword ptr [rcx+8]
8BC5 mov eax, ebp
99 cdq
41F7F8 idiv edx:eax, r8d
413BD0 cmp edx, r8d
0F8392000000 jae G_M38470_IG12
4863D2 movsxd rdx, edx
448B749110 mov r14d, dword ptr [rcx+4*rdx+16]
4585F6 test r14d, r14d
7C5C jl SHORT G_M38470_IG08
G_M38470_IG05:
488B5610 mov rdx, gword ptr [rsi+16]
443B7208 cmp r14d, dword ptr [rdx+8]
737B jae SHORT G_M38470_IG12
4963CE movsxd rcx, r14d
488D0C49 lea rcx, [rcx+2*rcx]
4C8D7CCA10 lea r15, bword ptr [rdx+8*rcx+16]
41396F08 cmp dword ptr [r15+8], ebp
7537 jne SHORT G_M38470_IG07
488B4E18 mov rcx, gword ptr [rsi+24]
4885C9 test rcx, rcx
7511 jne SHORT G_M38470_IG06
410FB75710 movzx rdx, word ptr [r15+16]
3BD3 cmp edx, ebx
0F94C2 sete dl
0FB6D2 movzx rdx, dl
85D2 test edx, edx
753F jne SHORT G_M38470_IG10
G_M38470_IG06:
4885C9 test rcx, rcx
7418 je SHORT G_M38470_IG07
410FB75710 movzx rdx, word ptr [r15+16]
448BC3 mov r8d, ebx
4C8D1D00000000 lea r11, [(reloc)]
3909 cmp dword ptr [rcx], ecx
41FF13 call qword ptr [r11]IEqualityComparer`1:Equals(char,char):bool:this
85C0 test eax, eax
7522 jne SHORT G_M38470_IG10
G_M38470_IG07:
458B770C mov r14d, dword ptr [r15+12]
4585F6 test r14d, r14d
7DA4 jge SHORT G_M38470_IG05
G_M38470_IG08:
C60700 mov byte ptr [rdi], 0
488BCE mov rcx, rsi
E800000000 call Dictionary`2:get_NotFound():byref:this
90 nop
G_M38470_IG09:
4883C428 add rsp, 40
5B pop rbx
5D pop rbp
5E pop rsi
5F pop rdi
415E pop r14
415F pop r15
C3 ret
G_M38470_IG10:
498BC7 mov rax, r15
G_M38470_IG11:
4883C428 add rsp, 40
5B pop rbx
5D pop rbp
5E pop rsi
5F pop rdi
415E pop r14
415F pop r15
C3 ret
G_M38470_IG12:
E800000000 call CORINFO_HELP_RNGCHKFAIL
CC int3
; Total bytes of code 255, prolog size 18 for method Dictionary`2:FindEntry(char,byref):byref:this |
PTAL @jkotas @AndyAyersMS @mikedn |
Benchmarks? I'd like to see evidence that the significant increase in size of It's also not very clear if changing |
I am preparing almost the same PR. |
buckets[targetBucket] = index; | ||
version++; | ||
|
||
// If we hit the collision threshold we'll need to switch to the comparer which is using randomized string hashing | ||
// i.e. EqualityComparer<string>.Default. | ||
|
||
if (collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer) | ||
if (typeof(TKey) == typeof(string) && collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not think typeof(TKey) == typeof(string)
check is going to help anything here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jkotas If I am reading that properly you are killing the whole branch when TKey
is not an string and struct. You are also adding a branching check when TKey
is a class. Could be a win if and only if the check introduced doesnt make its performance worse on the class path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right. Maybe the check can stay - but it should be after collisionCount > HashHelpers.HashCollisionThreshold
to not regress the class path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct on that front.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change change it to a struct check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How would you do that in that case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (default(TKey) == null && collisionCount > HashHelpers.HashCollisionThreshold ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice trick :D
@@ -349,22 +378,36 @@ public virtual void GetObjectData(SerializationInfo info, StreamingContext conte | |||
} | |||
} | |||
|
|||
private int FindEntry(TKey key) | |||
private ref Entry FindEntry(TKey key, out bool found) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with @mikendn - this change should be evaluated separately from the Comparer change (submitting it as separate PR would be best).
if (buckets != null) | ||
{ | ||
int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF; | ||
for (int i = buckets[hashCode % buckets.Length]; i >= 0; i = entries[i].next) | ||
int hashCode = (comparer == null ? EqualityComparer<TKey>.Default.GetHashCode(key) : comparer.GetHashCode(key)) & 0x7FFFFFFF; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can call GetHashCode
directly on the key as enums don't box anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And I would extract it to separate methods. Though it creates one more copy for complex keys but inlining of EqualityComparer<TKey>.Default
methods already introduces a couple of copies.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int GetKeyHashCode(TKey key)
{
if (default(TKey) != null)
{
return comparer != null ? comparer.GetHashCode(key) : key.GetHashCode();
}
else
{
return comparer.GetHashCode(key);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool KeysEqual(TKey x, TKey y)
{
if (default(TKey) != null)
{
return comparer != null ? comparer.Equals(x, y) : EqualityComparer<TKey>.Default.Equals(x, y);
}
else
{
return comparer.Equals(x, y);
}
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Be careful about the class path ... AggressiveInlining is not always respected for it because of the runtime does not know how to inline the complex generic dictionary access that it introduces.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm.. it works for int
as the key but I haven't yet checked tuples, complex structs and reference types.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Naive branch prediction (non-loop) prefers continue than jump forward; so using default comparer in the first path (being more common than custom comparer) would be better.
Will change hashcode.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That exactly why I changed the order :)
; comparer != null
488B4E18 mov rcx, gword ptr [rsi+24]
4885C9 test rcx, rcx
750C jne SHORT G_M33318_IG06
; inlined Defaul.Equals start
3BD7 cmp edx, edi
410F94C7 sete r15b
450FB6FF movzx r15, r15b
; inlined Defaul.Equals end
EB15 jmp SHORT G_M33318_IG07
G_M33318_IG06:
448BC7 mov r8d, edi
49BB0001C36DFE070000 mov r11, 0x7FE6DC30100
3909 cmp dword ptr [rcx], ecx
41FF13 call qword ptr [r11]System.Collections.Generic.IEqualityComparer`1[Int32][System.Int32]:Equals(int,int):bool:this
448BF8 mov r15d, eax
G_M33318_IG07:
4585FF test r15d, r15d
751A jne SHORT G_M33318_IG11
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh my days, you are right, the compilation reverses the tests 😢
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And yes, it doesn't inline for Dictionary
2[__Canon, Int32]` 😢
call System.Collections.Generic.Dictionary`2[__Canon,Int32][System.__Canon,System.Int32]:KeysEqual(ref,ref):bool:this
{ | ||
if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) | ||
ref Entry candidateEntry = ref entries[i]; | ||
if (candidateEntry.hashCode == hashCode && comparer.Equals(candidateEntry.key, key)) | ||
{ | ||
if (behavior == InsertionBehavior.OverwriteExisting) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@benaadams Here you can use the marker struct interface on the caller to evict the whole comparison in the loop.
On the call site use TryInsert<OverwriteExisting>(...)
and TryInsert<ThrowOnExisting>(...)
and generate 2 separate codepath for each.
EDIT: Dont know how much you can get from it, but it is worth to measure it as the call will get smaller.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did that in #14030; however its a bit trickier as dictionary is heavily crossgened and the whole stack is AoT R2R (rather than Jit) for the runtime store which cuts out some optimizations and introduces speed bumps for the class path.
I think the better approach is actually split the method and not use the flag
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In that case, probably, yes.
Private methods like |
@redknightlois |
@omariom there is a tension between the cost of a call, what the inliner can do with a big method. In our codebase experience, if the following checks are true, you are highly skewed over the wining (worth to check):
If those are true, even if the inliner doesn't do a great job you end up winning. The side effect is that the code is still organized logically but each method has its own contained code. You lose a small amount because of cache misses (hardly an issue if you are switching back and forth on operations on a hash-table) at the expense of high performance on repeated operations patterns. EDIT: Last sentence probably should be phrased the other way around to be clearer. |
The forced aggressive inlining works great for microbenchmarks, but it tends to hurt for real workloads. We care about code size for very frequently types like Dictionaries: the code for these types is specialized many times. Larger code size for them makes startup slower, and it does not always help steady state either because things won't fit into caches anymore. This is really a problem for tiered JIT to solve ... only aggressively inline specializations that are very hot in given workload. |
@jkotas While I wholeheartedly agree with both statements (Tiered and Microbenchmarks), our measurements suggest that while algorithmic improvements will give a certain edge (we use a different algorithm for our dictionary to get rid of These are on-the-wild measurements (not microbenchmarks) on high traffic call sites: https://ayende.com/blog/177377/fast-dictionary-and-struct-generic-arguments The case without the struct metaprogramming trick is still 2x over the stock one; hardly microbenchmark noise or overfitting. |
Looked at in #14030; but since the dictionary is heavily crossgened; these inlines are rejected due to "complex handle lookup"; and very common dictionary types are already crossgened. So can't degrade the AoT path; even though its better when it Jits at runtime - so is a bit of a balance |
@@ -72,10 +74,11 @@ public Dictionary(int capacity, IEqualityComparer<TKey> comparer) | |||
{ | |||
if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity); | |||
if (capacity > 0) Initialize(capacity); | |||
this.comparer = comparer ?? EqualityComparer<TKey>.Default; | |||
this.comparer = comparer; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can be :
if (comparer != EqualityComparer<TKey>.Default)
{
this.comparer = comparer;
}
So when comparer
param happens to be EqualityComparer<TKey>.Default
we don't lose the optimization
{ | ||
if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) return i; | ||
ref Entry candidateEntry = ref entries[i]; | ||
if (candidateEntry.hashCode == hashCode && ((comparer == null && EqualityComparer<TKey>.Default.Equals(candidateEntry.key, key)) || (comparer != null && comparer.Equals(candidateEntry.key, key)))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently it checks comparer
twice:
test rcx, rcx
jne SHORT G_M38470_IG06
;...
test rcx, rcx
je SHORT G_M38470_IG07
If you simplify it to
&& (comparer != null ? comparer.Equals(candidateEntry.key, key) : EqualityComparer<TKey>.Default.Equals(candidateEntry.key, key)
it will remove the second check.
Prefer #15419 |
Less adventurous than #14030
EqualityComparer<TKey>.Default
; default comparer isnull
rather than via interface