Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

Improve Dictionary<K,T> CQ - Take 2 #15411

Closed
wants to merge 4 commits into from
Closed

Conversation

benaadams
Copy link
Member

@benaadams benaadams commented Dec 7, 2017

Less adventurous than #14030

  • Ref locals in TryInsert
  • Ref return FindEntry
  • Use EqualityComparer<TKey>.Default; default comparer is null rather than via interface
  • Move null key check to inlinable methods, so can be elided

@benaadams
Copy link
Member Author

Before

; Total bytes of code 21, prolog size 5 for method Dictionary`2:TryAdd(char,ref):bool:this

; Lcl frame size = 40

G_M28467_IG01:
       57                   push     rdi
       56                   push     rsi
       4883EC28             sub      rsp, 40
       488BF1               mov      rsi, rcx
       498BF8               mov      rdi, r8

G_M28467_IG02:
       0FB7D2               movzx    rdx, dx
       488BCE               mov      rcx, rsi
       E800000000           call     Dictionary`2:FindEntry(char):int:this
       85C0                 test     eax, eax
       7C29                 jl       SHORT G_M28467_IG04
       488B5610             mov      rdx, gword ptr [rsi+16]
       3B4208               cmp      eax, dword ptr [rdx+8]
       732C                 jae      SHORT G_M28467_IG06
       4863C8               movsxd   rcx, eax
       488D0C49             lea      rcx, [rcx+2*rcx]
       488B54CA10           mov      rdx, gword ptr [rdx+8*rcx+16]
       488BCF               mov      rcx, rdi
       E800000000           call     CORINFO_HELP_CHECKED_ASSIGN_REF
       B801000000           mov      eax, 1

G_M28467_IG03:
       4883C428             add      rsp, 40
       5E                   pop      rsi
       5F                   pop      rdi
       C3                   ret      

G_M28467_IG04:
       33C0                 xor      rax, rax
       488907               mov      qword ptr [rdi], rax

G_M28467_IG05:
       4883C428             add      rsp, 40
       5E                   pop      rsi
       5F                   pop      rdi
       C3                   ret      

G_M28467_IG06:
       E800000000           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3     

After

; Assembly listing for method Dictionary`2:TryGetValue(char,byref):bool:this

; Lcl frame size = 48

G_M28467_IG01:
       56                   push     rsi
       4883EC30             sub      rsp, 48
       498BF0               mov      rsi, r8

G_M28467_IG02:
       0FB7D2               movzx    rdx, dx
       4C8D442428           lea      r8, bword ptr [rsp+28H]
       E800000000           call     Dictionary`2:FindEntry(char,byref):byref:this
       488B10               mov      rdx, gword ptr [rax]
       488BCE               mov      rcx, rsi
       E800000000           call     CORINFO_HELP_CHECKED_ASSIGN_REF
       8B442428             mov      eax, dword ptr [rsp+28H]
       0FB6C0               movzx    rax, al

G_M28467_IG03:
       4883C430             add      rsp, 48
       5E                   pop      rsi
       C3                   ret      

@benaadams
Copy link
Member Author

benaadams commented Dec 7, 2017

After is longer as it switches on the nullness of comparer to use EqualityComparer<TKey>.Default which is an intrinsic and inline and doesn't call via interface

Before

; Assembly listing for method Dictionary`2:FindEntry(char):int:this

; Lcl frame size = 48

G_M38464_IG01:
       4156                 push     r14
       57                   push     rdi
       56                   push     rsi
       55                   push     rbp
       53                   push     rbx
       4883EC30             sub      rsp, 48
       488BF1               mov      rsi, rcx

G_M38464_IG02:
       48837E0800           cmp      gword ptr [rsi+8], 0
       0F848C000000         je       G_M38464_IG05
       488B4E18             mov      rcx, gword ptr [rsi+24]
       0FB7FA               movzx    rdi, dx
       8BD7                 mov      edx, edi
       4C8D1D00000000       lea      r11, [(reloc)]
       3909                 cmp      dword ptr [rcx], ecx
       41FF13               call     qword ptr [r11]IEqualityComparer`1:GetHashCode(char):int:this
       8BD8                 mov      ebx, eax
       81E3FFFFFF7F         and      ebx, 0x7FFFFFFF
       488B4608             mov      rax, gword ptr [rsi+8]
       488BC8               mov      rcx, rax
       448B4008             mov      r8d, dword ptr [rax+8]
       8BC3                 mov      eax, ebx
       99                   cdq      
       41F7F8               idiv     edx:eax, r8d
       413BD0               cmp      edx, r8d
       7376                 jae      SHORT G_M38464_IG09
       4863D2               movsxd   rdx, edx
       8B6C9110             mov      ebp, dword ptr [rcx+4*rdx+16]
       85ED                 test     ebp, ebp
       7C4E                 jl       SHORT G_M38464_IG05

G_M38464_IG03:
       488B4E10             mov      rcx, gword ptr [rsi+16]
       3B6908               cmp      ebp, dword ptr [rcx+8]
       7362                 jae      SHORT G_M38464_IG09
       4863D5               movsxd   rdx, ebp
       4C8D3452             lea      r14, [rdx+2*rdx]
       4A8D4CF110           lea      rcx, bword ptr [rcx+8*r14+16]
       395908               cmp      dword ptr [rcx+8], ebx
       7522                 jne      SHORT G_M38464_IG04
       488B5618             mov      rdx, gword ptr [rsi+24]
       440FB74110           movzx    r8, word  ptr [rcx+16]
       488BCA               mov      rcx, rdx
       418BD0               mov      edx, r8d
       448BC7               mov      r8d, edi
       4C8D1D00000000       lea      r11, [(reloc)]
       3909                 cmp      dword ptr [rcx], ecx
       41FF13               call     qword ptr [r11]IEqualityComparer`1:Equals(char,char):bool:this
       85C0                 test     eax, eax
       7522                 jne      SHORT G_M38464_IG07

G_M38464_IG04:
       488B4610             mov      rax, gword ptr [rsi+16]
       3B6808               cmp      ebp, dword ptr [rax+8]
       7326                 jae      SHORT G_M38464_IG09
       428B6CF01C           mov      ebp, dword ptr [rax+8*r14+28]
       85ED                 test     ebp, ebp
       7DB2                 jge      SHORT G_M38464_IG03

G_M38464_IG05:
       B8FFFFFFFF           mov      eax, -1

G_M38464_IG06:
       4883C430             add      rsp, 48
       5B                   pop      rbx
       5D                   pop      rbp
       5E                   pop      rsi
       5F                   pop      rdi
       415E                 pop      r14
       C3                   ret      

G_M38464_IG07:
       8BC5                 mov      eax, ebp

G_M38464_IG08:
       4883C430             add      rsp, 48
       5B                   pop      rbx
       5D                   pop      rbp
       5E                   pop      rsi
       5F                   pop      rdi
       415E                 pop      r14
       C3                   ret      

G_M38464_IG09:
       E800000000           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3     

; Total bytes of code 199, prolog size 13 for method Dictionary`2:FindEntry(char):int:this

After

; Assembly listing for method Dictionary`2:FindEntry(char,byref):byref:this

; Lcl frame size = 40

G_M38470_IG01:
       4157                 push     r15
       4156                 push     r14
       57                   push     rdi
       56                   push     rsi
       55                   push     rbp
       53                   push     rbx
       4883EC28             sub      rsp, 40
       488BF1               mov      rsi, rcx
       498BF8               mov      rdi, r8

G_M38470_IG02:
       C60701               mov      byte  ptr [rdi], 1
       48837E0800           cmp      gword ptr [rsi+8], 0
       0F84B0000000         je       G_M38470_IG08
       488B4E18             mov      rcx, gword ptr [rsi+24]
       4885C9               test     rcx, rcx
       7413                 je       SHORT G_M38470_IG03
       0FB7DA               movzx    rbx, dx
       8BD3                 mov      edx, ebx
       4C8D1D00000000       lea      r11, [(reloc)]
       3909                 cmp      dword ptr [rcx], ecx
       41FF13               call     qword ptr [r11]IEqualityComparer`1:GetHashCode(char):int:this
       EB0C                 jmp      SHORT G_M38470_IG04

G_M38470_IG03:
       0FB7DA               movzx    rbx, dx
       8BC3                 mov      eax, ebx
       8BD0                 mov      edx, eax
       C1E210               shl      edx, 16
       0BC2                 or       eax, edx

G_M38470_IG04:
       8BE8                 mov      ebp, eax
       81E5FFFFFF7F         and      ebp, 0x7FFFFFFF
       488B4E08             mov      rcx, gword ptr [rsi+8]
       448B4108             mov      r8d, dword ptr [rcx+8]
       8BC5                 mov      eax, ebp
       99                   cdq      
       41F7F8               idiv     edx:eax, r8d
       413BD0               cmp      edx, r8d
       0F8392000000         jae      G_M38470_IG12
       4863D2               movsxd   rdx, edx
       448B749110           mov      r14d, dword ptr [rcx+4*rdx+16]
       4585F6               test     r14d, r14d
       7C5C                 jl       SHORT G_M38470_IG08

G_M38470_IG05:
       488B5610             mov      rdx, gword ptr [rsi+16]
       443B7208             cmp      r14d, dword ptr [rdx+8]
       737B                 jae      SHORT G_M38470_IG12
       4963CE               movsxd   rcx, r14d
       488D0C49             lea      rcx, [rcx+2*rcx]
       4C8D7CCA10           lea      r15, bword ptr [rdx+8*rcx+16]
       41396F08             cmp      dword ptr [r15+8], ebp
       7537                 jne      SHORT G_M38470_IG07
       488B4E18             mov      rcx, gword ptr [rsi+24]
       4885C9               test     rcx, rcx
       7511                 jne      SHORT G_M38470_IG06
       410FB75710           movzx    rdx, word  ptr [r15+16]
       3BD3                 cmp      edx, ebx
       0F94C2               sete     dl
       0FB6D2               movzx    rdx, dl
       85D2                 test     edx, edx
       753F                 jne      SHORT G_M38470_IG10

G_M38470_IG06:
       4885C9               test     rcx, rcx
       7418                 je       SHORT G_M38470_IG07
       410FB75710           movzx    rdx, word  ptr [r15+16]
       448BC3               mov      r8d, ebx
       4C8D1D00000000       lea      r11, [(reloc)]
       3909                 cmp      dword ptr [rcx], ecx
       41FF13               call     qword ptr [r11]IEqualityComparer`1:Equals(char,char):bool:this
       85C0                 test     eax, eax
       7522                 jne      SHORT G_M38470_IG10

G_M38470_IG07:
       458B770C             mov      r14d, dword ptr [r15+12]
       4585F6               test     r14d, r14d
       7DA4                 jge      SHORT G_M38470_IG05

G_M38470_IG08:
       C60700               mov      byte  ptr [rdi], 0
       488BCE               mov      rcx, rsi
       E800000000           call     Dictionary`2:get_NotFound():byref:this
       90                   nop      

G_M38470_IG09:
       4883C428             add      rsp, 40
       5B                   pop      rbx
       5D                   pop      rbp
       5E                   pop      rsi
       5F                   pop      rdi
       415E                 pop      r14
       415F                 pop      r15
       C3                   ret      

G_M38470_IG10:
       498BC7               mov      rax, r15

G_M38470_IG11:
       4883C428             add      rsp, 40
       5B                   pop      rbx
       5D                   pop      rbp
       5E                   pop      rsi
       5F                   pop      rdi
       415E                 pop      r14
       415F                 pop      r15
       C3                   ret      

G_M38470_IG12:
       E800000000           call     CORINFO_HELP_RNGCHKFAIL
       CC                   int3     

; Total bytes of code 255, prolog size 18 for method Dictionary`2:FindEntry(char,byref):byref:this

@benaadams benaadams changed the title [WIP] Improve Dictionary<K,T> CQ - Take 2 Improve Dictionary<K,T> CQ - Take 2 Dec 7, 2017
@benaadams
Copy link
Member Author

PTAL @jkotas @AndyAyersMS @mikedn

@mikedn
Copy link

mikedn commented Dec 7, 2017

Benchmarks? I'd like to see evidence that the significant increase in size of FindEntry due to default comparer special handling is warranted. I suppose it is but actual numbers would be better than suppositions.

It's also not very clear if changing FindEntry to return ref and adding an out parameter to it is useful. These changes should be evaluated individually based on each change's merit.

@omariom
Copy link

omariom commented Dec 7, 2017

I am preparing almost the same PR.
Perf is better 10-30% for int as the key. But set_Item is worse 13% for the perf test from Perf.Dictionary tests in CoreFX.

buckets[targetBucket] = index;
version++;

// If we hit the collision threshold we'll need to switch to the comparer which is using randomized string hashing
// i.e. EqualityComparer<string>.Default.

if (collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer)
if (typeof(TKey) == typeof(string) && collisionCount > HashHelpers.HashCollisionThreshold && comparer is NonRandomizedStringEqualityComparer)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think typeof(TKey) == typeof(string) check is going to help anything here.

Copy link

@redknightlois redknightlois Dec 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jkotas If I am reading that properly you are killing the whole branch when TKey is not an string and struct. You are also adding a branching check when TKey is a class. Could be a win if and only if the check introduced doesnt make its performance worse on the class path.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. Maybe the check can stay - but it should be after collisionCount > HashHelpers.HashCollisionThreshold to not regress the class path.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct on that front.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change change it to a struct check?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you do that in that case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (default(TKey) == null && collisionCount > HashHelpers.HashCollisionThreshold ...

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice trick :D

@@ -349,22 +378,36 @@ public virtual void GetObjectData(SerializationInfo info, StreamingContext conte
}
}

private int FindEntry(TKey key)
private ref Entry FindEntry(TKey key, out bool found)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @mikendn - this change should be evaluated separately from the Comparer change (submitting it as separate PR would be best).

if (buckets != null)
{
int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;
for (int i = buckets[hashCode % buckets.Length]; i >= 0; i = entries[i].next)
int hashCode = (comparer == null ? EqualityComparer<TKey>.Default.GetHashCode(key) : comparer.GetHashCode(key)) & 0x7FFFFFFF;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can call GetHashCode directly on the key as enums don't box anymore.

Copy link

@omariom omariom Dec 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And I would extract it to separate methods. Though it creates one more copy for complex keys but inlining of EqualityComparer<TKey>.Default methods already introduces a couple of copies.

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private int GetKeyHashCode(TKey key)
        {
            if (default(TKey) != null)
            {
                return comparer != null ? comparer.GetHashCode(key) : key.GetHashCode();
            }
            else
            {
                return comparer.GetHashCode(key);
            }
        }

        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private bool KeysEqual(TKey x, TKey y)
        {
            if (default(TKey) != null)
            {
                return comparer != null ? comparer.Equals(x, y) : EqualityComparer<TKey>.Default.Equals(x, y);
            }
            else
            {
                return comparer.Equals(x, y);
            }
        }

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Be careful about the class path ... AggressiveInlining is not always respected for it because of the runtime does not know how to inline the complex generic dictionary access that it introduces.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm.. it works for intas the key but I haven't yet checked tuples, complex structs and reference types.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naive branch prediction (non-loop) prefers continue than jump forward; so using default comparer in the first path (being more common than custom comparer) would be better.

Will change hashcode.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That exactly why I changed the order :)

       ; comparer != null
       488B4E18             mov      rcx, gword ptr [rsi+24]
       4885C9               test     rcx, rcx
       750C                 jne      SHORT G_M33318_IG06
       ; inlined Defaul.Equals start
       3BD7                 cmp      edx, edi
       410F94C7             sete     r15b
       450FB6FF             movzx    r15, r15b 
       ; inlined Defaul.Equals end
       EB15                 jmp      SHORT G_M33318_IG07

G_M33318_IG06:
       448BC7               mov      r8d, edi
       49BB0001C36DFE070000 mov      r11, 0x7FE6DC30100
       3909                 cmp      dword ptr [rcx], ecx
       41FF13               call     qword ptr [r11]System.Collections.Generic.IEqualityComparer`1[Int32][System.Int32]:Equals(int,int):bool:this
       448BF8               mov      r15d, eax

G_M33318_IG07:
       4585FF               test     r15d, r15d
       751A                 jne      SHORT G_M33318_IG11

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh my days, you are right, the compilation reverses the tests 😢

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And yes, it doesn't inline for Dictionary2[__Canon, Int32]` 😢

call     System.Collections.Generic.Dictionary`2[__Canon,Int32][System.__Canon,System.Int32]:KeysEqual(ref,ref):bool:this

{
if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key))
ref Entry candidateEntry = ref entries[i];
if (candidateEntry.hashCode == hashCode && comparer.Equals(candidateEntry.key, key))
{
if (behavior == InsertionBehavior.OverwriteExisting)
Copy link

@redknightlois redknightlois Dec 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@benaadams Here you can use the marker struct interface on the caller to evict the whole comparison in the loop.

On the call site use TryInsert<OverwriteExisting>(...) and TryInsert<ThrowOnExisting>(...) and generate 2 separate codepath for each.

EDIT: Dont know how much you can get from it, but it is worth to measure it as the call will get smaller.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did that in #14030; however its a bit trickier as dictionary is heavily crossgened and the whole stack is AoT R2R (rather than Jit) for the runtime store which cuts out some optimizations and introduces speed bumps for the class path.

I think the better approach is actually split the method and not use the flag

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In that case, probably, yes.

@redknightlois
Copy link

Private methods like FindEntry and TryAdd could be forced inlined if they are not already in all codepaths to diminish the amount of method calls, that has a very high impact on high-frequency calls like the typical dictionary use.

@omariom
Copy link

omariom commented Dec 7, 2017

@redknightlois
ContainsKey is small enough to be auto inlined. Inlining such methods as FindEntry and TryAdd with complex logic and loops will unlikely be beneficial.

@redknightlois
Copy link

redknightlois commented Dec 7, 2017

@omariom there is a tension between the cost of a call, what the inliner can do with a big method. In our codebase experience, if the following checks are true, you are highly skewed over the wining (worth to check):

  • If the caller site is less than 30% of the site
  • If the parent method is likely to be called in high-frequency scenarios.
  • If the method is being used once and in isolation on all call sites (very important for inlining big methods)

If those are true, even if the inliner doesn't do a great job you end up winning. The side effect is that the code is still organized logically but each method has its own contained code. You lose a small amount because of cache misses (hardly an issue if you are switching back and forth on operations on a hash-table) at the expense of high performance on repeated operations patterns.

EDIT: Last sentence probably should be phrased the other way around to be clearer.

@jkotas
Copy link
Member

jkotas commented Dec 7, 2017

complex logic and loops will unlikely be beneficial.

The forced aggressive inlining works great for microbenchmarks, but it tends to hurt for real workloads. We care about code size for very frequently types like Dictionaries: the code for these types is specialized many times. Larger code size for them makes startup slower, and it does not always help steady state either because things won't fit into caches anymore.

This is really a problem for tiered JIT to solve ... only aggressively inline specializations that are very hot in given workload.

@redknightlois
Copy link

@jkotas While I wholeheartedly agree with both statements (Tiered and Microbenchmarks), our measurements suggest that while algorithmic improvements will give a certain edge (we use a different algorithm for our dictionary to get rid of idiv instructions), most of the actual performance work that I needed to do was apply those simple optimizations to get a 2x improvement over the stock dictionary.

These are on-the-wild measurements (not microbenchmarks) on high traffic call sites: https://ayende.com/blog/177377/fast-dictionary-and-struct-generic-arguments

The case without the struct metaprogramming trick is still 2x over the stock one; hardly microbenchmark noise or overfitting.

@benaadams
Copy link
Member Author

Private methods like FindEntry and TryAdd could be forced inlined if they are not already in all codepaths to diminish the amount of method calls

Looked at in #14030; but since the dictionary is heavily crossgened; these inlines are rejected due to "complex handle lookup"; and very common dictionary types are already crossgened. So can't degrade the AoT path; even though its better when it Jits at runtime - so is a bit of a balance

@@ -72,10 +74,11 @@ public Dictionary(int capacity, IEqualityComparer<TKey> comparer)
{
if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity);
if (capacity > 0) Initialize(capacity);
this.comparer = comparer ?? EqualityComparer<TKey>.Default;
this.comparer = comparer;
Copy link

@omariom omariom Dec 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be :

if (comparer != EqualityComparer<TKey>.Default)
{
    this.comparer = comparer;
}

So when comparer param happens to be EqualityComparer<TKey>.Default we don't lose the optimization

{
if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) return i;
ref Entry candidateEntry = ref entries[i];
if (candidateEntry.hashCode == hashCode && ((comparer == null && EqualityComparer<TKey>.Default.Equals(candidateEntry.key, key)) || (comparer != null && comparer.Equals(candidateEntry.key, key))))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently it checks comparer twice:

       test     rcx, rcx
       jne      SHORT G_M38470_IG06
       ;...
       test     rcx, rcx
       je       SHORT G_M38470_IG07

If you simplify it to

&& (comparer != null ? comparer.Equals(candidateEntry.key, key) : EqualityComparer<TKey>.Default.Equals(candidateEntry.key, key)

it will remove the second check.

@benaadams benaadams mentioned this pull request Dec 7, 2017
@benaadams
Copy link
Member Author

Prefer #15419

@benaadams benaadams closed this Dec 16, 2017
@benaadams benaadams deleted the dict-cq branch October 14, 2019 00:14
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants