-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Promote implicit-byref struct arguments #10453
Conversation
85719ca
to
e2e6a83
Compare
8ee2ff7
to
8949728
Compare
d1a2219
to
168bdf5
Compare
e3b2862
to
065080a
Compare
This is ready for review now (though not planning to merge it until after the 2.0 stabilization, so leaving the "no merge" tag for now). I've split the change into a few supporting commits, then one that does the phase reordering (but with the heuristic set to avoid actually following through with implicit byref promotions), and lastly a change that adjusts the heuristic. I've verified that all changes but the last produce no asm diffs. |
Jit-diff results (release --frameworks --tests):
|
The diff totals seem odd. Full Fx + Test is surely more than 79 assemblies and 65690 methods...? |
Results in span perf benchmarks are positive. The
The
As an example, here's what the diffs look like in diff --git a/quicksort-base.txt b/quicksort-diff.txt
index 3e58d14..7d2c92a 100644
--- a/quicksort-base.txt
+++ b/quicksort-diff.txt
@@ -5,13 +5,13 @@
; fully interruptible
; Final local variable assignments
;
-; V00 arg0 [V00,T00] ( 58,181.50) byref -> rsi ld-addr-op
+; V00 arg0 [V00,T05] ( 32, 8 ) byref -> rcx ld-addr-op
;* V01 loc0 [V01 ] ( 0, 0 ) int -> zero-ref
-; V02 loc1 [V02,T07] ( 6, 3 ) int -> rax
-; V03 loc2 [V03,T02] ( 21, 86.25) int -> rdi
-; V04 loc3 [V04,T01] ( 17, 96.50) int -> rdx
-; V05 loc4 [V05,T03] ( 5, 33.50) int -> rcx
-; V06 loc5 [V06,T06] ( 4, 5 ) int -> r8
+; V02 loc1 [V02,T10] ( 5, 2.50) int -> rcx
+; V03 loc2 [V03,T01] ( 21, 86.25) int -> rbx
+; V04 loc3 [V04,T00] ( 17, 99.50) int -> rax
+; V05 loc4 [V05,T04] ( 5, 33.50) int -> rdx
+; V06 loc5 [V06,T06] ( 6, 7.50) int -> r8
;* V07 tmp0 [V07 ] ( 0, 0 ) struct (16) zero-ref
;* V08 tmp1 [V08 ] ( 0, 0 ) struct (16) zero-ref
;* V09 tmp2 [V09 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op
@@ -19,31 +19,32 @@
;* V11 tmp4 [V11 ] ( 0, 0 ) byref -> zero-ref ld-addr-op
;* V12 tmp5 [V12 ] ( 0, 0 ) byref -> zero-ref
;* V13 tmp6 [V13 ] ( 0, 0 ) struct ( 8) zero-ref
-; V14 tmp7 [V14,T05] ( 6, 6 ) int -> rcx
+; V14 tmp7 [V14,T07] ( 4, 4 ) int -> rcx
;* V15 tmp8 [V15 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op
;* V16 tmp9 [V16 ] ( 0, 0 ) struct (16) zero-ref
;* V17 tmp10 [V17 ] ( 0, 0 ) byref -> zero-ref ld-addr-op
-; V18 tmp11 [V18,T12] ( 2, 2 ) byref -> rdx
+; V18 tmp11 [V18,T11] ( 2, 2 ) byref -> rax
;* V19 tmp12 [V19 ] ( 0, 0 ) struct ( 8) zero-ref
-; V20 tmp13 [V20,T11] ( 3, 3 ) int -> rcx ld-addr-op
-; V21 tmp14 [V21,T13] ( 2, 1 ) byref -> rcx V07._pointer(offs=0x00) P-INDEP
-; V22 tmp15 [V22,T21] ( 2, 1 ) int -> rax V07._length(offs=0x08) P-INDEP
-; V23 tmp16 [V23,T14] ( 2, 1 ) byref -> rdx V08._pointer(offs=0x00) P-INDEP
-; V24 tmp17 [V24,T22] ( 2, 1 ) int -> rcx V08._length(offs=0x08) P-INDEP
-; V25 tmp18 [V25,T15] ( 2, 1 ) byref -> rcx V09._value(offs=0x00) P-INDEP
-; V26 tmp19 [V26,T16] ( 2, 1 ) byref -> rcx V10._pointer(offs=0x00) P-INDEP
-; V27 tmp20 [V27,T23] ( 2, 1 ) int -> rax V10._length(offs=0x08) P-INDEP
-; V28 tmp21 [V28,T17] ( 2, 1 ) byref -> rcx V13._value(offs=0x00) P-INDEP
-; V29 tmp22 [V29,T18] ( 2, 1 ) byref -> rdx V15._value(offs=0x00) P-INDEP
-; V30 tmp23 [V30,T19] ( 2, 1 ) byref -> rdx V16._pointer(offs=0x00) P-INDEP
-; V31 tmp24 [V31,T24] ( 2, 1 ) int -> rax V16._length(offs=0x08) P-INDEP
-; V32 tmp25 [V32,T20] ( 2, 1 ) byref -> rdx V19._value(offs=0x00) P-INDEP
-; V33 tmp26 [V33 ] ( 6, 8 ) struct (16) [rsp+0x20] do-not-enreg[XSB] must-init addr-exposed
-; V34 tmp27 [V34,T09] ( 3, 3 ) byref -> rdx stack-byref
-; V35 tmp28 [V35,T10] ( 3, 3 ) byref -> rax stack-byref
-; V36 OutArgs [V36 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
-; V37 cse0 [V37,T04] ( 8, 6 ) int -> rcx
-; V38 cse1 [V38,T08] ( 6, 3 ) int -> rax
+; V20 tmp13 [V20,T12] ( 2, 2 ) int -> rcx ld-addr-op
+; V21 tmp14 [V21,T02] ( 18, 52 ) byref -> rsi V35._pointer(offs=0x00) P-INDEP
+; V22 tmp15 [V22,T03] ( 14, 44.25) int -> rdi V35._length(offs=0x08) P-INDEP
+;* V23 tmp16 [V23,T20] ( 0, 0 ) byref -> zero-ref V07._pointer(offs=0x00) P-INDEP
+; V24 tmp17 [V24,T16] ( 2, 1 ) int -> rcx V07._length(offs=0x08) P-INDEP
+; V25 tmp18 [V25,T13] ( 2, 1 ) byref -> rax V08._pointer(offs=0x00) P-INDEP
+; V26 tmp19 [V26,T17] ( 2, 1 ) int -> rcx V08._length(offs=0x08) P-INDEP
+;* V27 tmp20 [V27 ] ( 0, 0 ) byref -> zero-ref V09._value(offs=0x00) P-INDEP
+;* V28 tmp21 [V28,T21] ( 0, 0 ) byref -> zero-ref V10._pointer(offs=0x00) P-INDEP
+; V29 tmp22 [V29,T18] ( 2, 1 ) int -> rcx V10._length(offs=0x08) P-INDEP
+;* V30 tmp23 [V30,T22] ( 0, 0 ) byref -> zero-ref V13._value(offs=0x00) P-INDEP
+;* V31 tmp24 [V31 ] ( 0, 0 ) byref -> zero-ref V15._value(offs=0x00) P-INDEP
+; V32 tmp25 [V32,T14] ( 2, 1 ) byref -> rax V16._pointer(offs=0x00) P-INDEP
+; V33 tmp26 [V33,T19] ( 2, 1 ) int -> rdx V16._length(offs=0x08) P-INDEP
+; V34 tmp27 [V34,T15] ( 2, 1 ) byref -> rax V19._value(offs=0x00) P-INDEP
+;* V35 tmp28 [V35 ] ( 0, 0 ) struct (16) zero-ref
+; V36 tmp29 [V36 ] ( 6, 8 ) struct (16) [rsp+0x20] do-not-enreg[XSB] must-init addr-exposed
+; V37 tmp30 [V37,T08] ( 3, 3 ) byref -> rax stack-byref
+; V38 tmp31 [V38,T09] ( 3, 3 ) byref -> rdx stack-byref
+; V39 OutArgs [V39 ] ( 1, 1 ) lclBlk (32) [rsp+0x00]
;
; Lcl frame size = 48
@@ -55,11 +56,11 @@ G_M61353_IG01:
33C0 xor rax, rax
4889442420 mov qword ptr [rsp+20H], rax
4889442428 mov qword ptr [rsp+28H], rax
- 488BF1 mov rsi, rcx
G_M61353_IG02:
- 8B4E08 mov ecx, dword ptr [rsi+8]
- 83F901 cmp ecx, 1
+ 488B31 mov rsi, bword ptr [rcx]
+ 8B7908 mov edi, dword ptr [rcx+8]
+ 83FF01 cmp edi, 1
7F08 jg SHORT G_M61353_IG04
G_M61353_IG03:
@@ -70,120 +71,100 @@ G_M61353_IG03:
C3 ret
G_M61353_IG04:
- 8D41FF lea eax, [rcx-1]
- 33FF xor edi, edi
- 8BD0 mov edx, eax
- 3BD1 cmp edx, ecx
- 0F834A010000 jae G_M61353_IG17
- 488B0E mov rcx, bword ptr [rsi]
- 4C63C2 movsxd r8, edx
- 428B0C81 mov ecx, dword ptr [rcx+4*r8]
- E98E000000 jmp G_M61353_IG10
+ 8D4FFF lea ecx, [rdi-1]
+ 33DB xor ebx, ebx
+ 8BC1 mov eax, ecx
+ 3BC7 cmp eax, edi
+ 0F83EF000000 jae G_M61353_IG17
+ 4863D0 movsxd rdx, eax
+ 8B1496 mov edx, dword ptr [rsi+4*rdx]
+ EB66 jmp SHORT G_M61353_IG10
G_M61353_IG05:
- FFC7 inc edi
+ FFC3 inc ebx
G_M61353_IG06:
- 3BFA cmp edi, edx
- 7D1A jge SHORT G_M61353_IG08
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 0F832B010000 jae G_M61353_IG17
- 4C8B06 mov r8, bword ptr [rsi]
- 4C63CF movsxd r9, edi
- 43390C88 cmp dword ptr [r8+4*r9], ecx
- 7EE5 jle SHORT G_M61353_IG05
+ 3BD8 cmp ebx, eax
+ 7D16 jge SHORT G_M61353_IG08
+ 3BDF cmp ebx, edi
+ 0F83D9000000 jae G_M61353_IG17
+ 4C63C3 movsxd r8, ebx
+ 42391486 cmp dword ptr [rsi+4*r8], edx
+ 7EE9 jle SHORT G_M61353_IG05
EB03 jmp SHORT G_M61353_IG08
G_M61353_IG07:
- FFCA dec edx
+ FFC8 dec eax
G_M61353_IG08:
- 3BD7 cmp edx, edi
- 7E16 jle SHORT G_M61353_IG09
- 3B5608 cmp edx, dword ptr [rsi+8]
- 0F830D010000 jae G_M61353_IG17
- 4C8B06 mov r8, bword ptr [rsi]
- 4C63CA movsxd r9, edx
- 43390C88 cmp dword ptr [r8+4*r9], ecx
- 7DE5 jge SHORT G_M61353_IG07
+ 3BC3 cmp eax, ebx
+ 7E12 jle SHORT G_M61353_IG09
+ 3BC7 cmp eax, edi
+ 0F83BF000000 jae G_M61353_IG17
+ 4C63C0 movsxd r8, eax
+ 42391486 cmp dword ptr [rsi+4*r8], edx
+ 7DE9 jge SHORT G_M61353_IG07
G_M61353_IG09:
- 3BFA cmp edi, edx
- 7D4F jge SHORT G_M61353_IG10
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 0F83F3000000 jae G_M61353_IG17
- 4C8B06 mov r8, bword ptr [rsi]
- 4C63CF movsxd r9, edi
- 478B0488 mov r8d, dword ptr [r8+4*r9]
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 0F83E0000000 jae G_M61353_IG17
- 4C8B0E mov r9, bword ptr [rsi]
- 4C63D7 movsxd r10, edi
- 3B5608 cmp edx, dword ptr [rsi+8]
- 0F83D1000000 jae G_M61353_IG17
- 4C8B1E mov r11, bword ptr [rsi]
- 4863DA movsxd rbx, edx
- 458B1C9B mov r11d, dword ptr [r11+4*rbx]
- 47891C91 mov dword ptr [r9+4*r10], r11d
- 3B5608 cmp edx, dword ptr [rsi+8]
- 0F83BA000000 jae G_M61353_IG17
- 4C8B0E mov r9, bword ptr [rsi]
- 4C63D2 movsxd r10, edx
- 47890491 mov dword ptr [r9+4*r10], r8d
+ 3BD8 cmp ebx, eax
+ 7D30 jge SHORT G_M61353_IG10
+ 3BDF cmp ebx, edi
+ 0F83A9000000 jae G_M61353_IG17
+ 4C63C3 movsxd r8, ebx
+ 468B0486 mov r8d, dword ptr [rsi+4*r8]
+ 4C63CB movsxd r9, ebx
+ 3BC7 cmp eax, edi
+ 0F8397000000 jae G_M61353_IG17
+ 4C63D0 movsxd r10, eax
+ 468B1496 mov r10d, dword ptr [rsi+4*r10]
+ 4689148E mov dword ptr [rsi+4*r9], r10d
+ 4C63C8 movsxd r9, eax
+ 4689048E mov dword ptr [rsi+4*r9], r8d
G_M61353_IG10:
- 3BFA cmp edi, edx
- 0F8C72FFFFFF jl G_M61353_IG06
- 3BF8 cmp edi, eax
- 7436 je SHORT G_M61353_IG11
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 0F8398000000 jae G_M61353_IG17
- 4C8B06 mov r8, bword ptr [rsi]
- 4863D7 movsxd rdx, edi
- 458B0490 mov r8d, dword ptr [r8+4*rdx]
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 0F8385000000 jae G_M61353_IG17
- 488B16 mov rdx, bword ptr [rsi]
- 4C63CF movsxd r9, edi
- 42890C8A mov dword ptr [rdx+4*r9], ecx
- 3B4608 cmp eax, dword ptr [rsi+8]
- 7376 jae SHORT G_M61353_IG17
- 488B0E mov rcx, bword ptr [rsi]
- 4863C0 movsxd rax, eax
- 44890481 mov dword ptr [rcx+4*rax], r8d
+ 3BD8 cmp ebx, eax
+ 7C9E jl SHORT G_M61353_IG06
+ 3BD9 cmp ebx, ecx
+ 7419 je SHORT G_M61353_IG11
+ 3BDF cmp ebx, edi
+ 7375 jae SHORT G_M61353_IG17
+ 4C63C3 movsxd r8, ebx
+ 468B0486 mov r8d, dword ptr [rsi+4*r8]
+ 4863C3 movsxd rax, ebx
+ 891486 mov dword ptr [rsi+4*rax], edx
+ 4863C9 movsxd rcx, ecx
+ 4489048E mov dword ptr [rsi+4*rcx], r8d
G_M61353_IG11:
- 837E0800 cmp dword ptr [rsi+8], 0
- 725A jb SHORT G_M61353_IG15
- 3B7E08 cmp edi, dword ptr [rsi+8]
- 7755 ja SHORT G_M61353_IG15
+ 85FF test edi, edi
+ 7251 jb SHORT G_M61353_IG15
+ 3BDF cmp ebx, edi
+ 774D ja SHORT G_M61353_IG15
G_M61353_IG12:
- 488B0E mov rcx, bword ptr [rsi]
- 8BC7 mov eax, edi
- 488D542420 lea rdx, bword ptr [rsp+20H]
- 48890A mov bword ptr [rdx], rcx
- 894208 mov dword ptr [rdx+8], eax
+ 8BCB mov ecx, ebx
+ 488D442420 lea rax, bword ptr [rsp+20H]
+ 488930 mov bword ptr [rax], rsi
+ 894808 mov dword ptr [rax+8], ecx
488D4C2420 lea rcx, bword ptr [rsp+20H]
- E89EA5FFFF call SpanBench:TestQuickSortSpan(struct)
- 8D4F01 lea ecx, [rdi+1]
- 8B4608 mov eax, dword ptr [rsi+8]
- 3BC8 cmp ecx, eax
- 7736 ja SHORT G_M61353_IG16
+ E8F4A5FFFF call SpanBench:TestQuickSortSpan(struct)
+ 8D4B01 lea ecx, [rbx+1]
+ 3BCF cmp ecx, edi
+ 7734 ja SHORT G_M61353_IG16
G_M61353_IG13:
- 488B16 mov rdx, bword ptr [rsi]
- 4C63C1 movsxd r8, ecx
- 4A8D1482 lea rdx, bword ptr [rdx+4*r8]
- 2BC1 sub eax, ecx
- 8BC8 mov ecx, eax
- 8BC1 mov eax, ecx
- 8BC8 mov ecx, eax
- 488D442420 lea rax, bword ptr [rsp+20H]
- 488910 mov bword ptr [rax], rdx
- 894808 mov dword ptr [rax+8], ecx
+ 4863C1 movsxd rax, ecx
+ 488D0486 lea rax, bword ptr [rsi+4*rax]
+ 8BD7 mov edx, edi
+ 2BD1 sub edx, ecx
+ 8BCA mov ecx, edx
+ 8BD1 mov edx, ecx
+ 8BCA mov ecx, edx
+ 488D542420 lea rdx, bword ptr [rsp+20H]
+ 488902 mov bword ptr [rdx], rax
+ 894A08 mov dword ptr [rdx+8], ecx
488D4C2420 lea rcx, bword ptr [rsp+20H]
- E86DA5FFFF call SpanBench:TestQuickSortSpan(struct)
+ E8C7A5FFFF call SpanBench:TestQuickSortSpan(struct)
90 nop
G_M61353_IG14:
@@ -194,16 +175,16 @@ G_M61353_IG14:
C3 ret
G_M61353_IG15:
- E81F915256 call ThrowHelper:ThrowArgumentOutOfRangeException()
+ E879915156 call ThrowHelper:ThrowArgumentOutOfRangeException()
G_M61353_IG16:
- E81A915256 call ThrowHelper:ThrowArgumentOutOfRangeException()
+ E874915156 call ThrowHelper:ThrowArgumentOutOfRangeException()
CC int3
G_M61353_IG17:
- E8A4AEAC5F call CORINFO_HELP_RNGCHKFAIL
+ E8FEAEAB5F call CORINFO_HELP_RNGCHKFAIL
CC int3
-; Total bytes of code 381, prolog size 22 for method SpanBench:TestQuickSortSpan(struct)
+; Total bytes of code 291, prolog size 19 for method SpanBench:TestQuickSortSpan(struct)
; ============================================================
-TestQuickSortSpan(1024): 41.2957ms
+TestQuickSortSpan(1024): 35.1321ms |
@briansull PTAL |
Right you are, I forgot to pass --recursive to jit-analyze. Will update... |
jit-diff results with --recursive:
|
There is a weighted ref count that might be a better input to the profitability heuristic. I presume it's not set up yet (though perhaps with IBC?). If we work on having plausible block weights earlier in the phase ordering (which I'd like to do so the inliner could use them) then we should remember to revisit this. |
You're right that the weighted ref counts are not set up yet, and I definitely agree that we should use them in this heuristic if/when we make them available there. |
@briansull @dotnet/jit-contrib ping |
src/jit/morph.cpp
Outdated
{ | ||
hasAddrExposedVars = true; | ||
break; | ||
} | ||
if (varDsc->lvPromoted && varDsc->lvIsParam) | ||
else if (varDsc->lvAddrExposed) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure why you changed this to an else if
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you're just seeing github's diff lining up the wrong lines. I changed
if (A || B) { // note the disjunction here
X;
}
if (C) {
Y;
}
to
if (A) { // no longer disjunction
X;
} else if (B) { // because second clause needs different handling now
Z;
}
if (C && !D) {
Y;
}
so what was an if
is still an if
, not an else-if
(it's just down on line 7738 now)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I think I see that, but my point was that the previous if
still ends in a break;
so there's no need for the else if (just as before). However, I'm pretty sure it will make exactly zero difference in the generated code.
In reply to: 116548706 [](ancestors = 116548706)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, got it now. Yeah, I was just mechanically splitting the A || B
without thinking about the implications of the break
. Looking at it now, I think it reads better w/o else
, will update.
This looks good to me. I'm a little concerned about the special cases in |
UNIX_AMD64_STRUCT_PASSING doesn't have implicit byref arguments, so it can use the same early-outs as other targets that don't have them.
Since we are specifically looking for parameters, we don't need to consult any entries past `compArgsCount`.
This will be necessary when an implicit byref would otherwise be an HFA reg arg, when we rewrite the arg to a pointer.
The check for struct types needs to match the check in lvMarkImplicitByRefLocals, which uses varTypeIsStruct (and therefore includes SIMD types), instead of just using TYP_STRUCT.
Change the phase ordering with respect to address-exposed analysis, struct promotion, and implicit byref rewriting. This is done to subject implicit byref parameters to struct promotion; when an implicit byref is to be promoted, generate a new local of the struct type, promote it, and insert code at method entry to initialize the new local by struct-copy from the parameter. As part of the reordering, run address-exposed analysis before implicit byref rewriting, so that the results of address-exposed analysis can be consulted in determining whether promoting each implicit byref is beneficial (in light of the cost of the struct copy), and effectively avoid promoting those which are not deemed beneficial. This change treats all implicit byref promotions as NOT beneficial; it reorders the phases but produces no asm diffs. A subsequent change will identify beneficial implicit byref promotions. To make this work, the implicit byref rewriting happens in a few stages: - fgMarkImplicitByRefArgs now does nothing more than flag which args are implicit byrefs, and runs before struct promotion. - fgRetypeImplicitByRefArgs actually retypes the parameters (from struct to pointer), and decides which promotions are beneficial. It inserts the initializing struct copies for beneficial ones, and annotates non-beneficial ones so their appearances will be rewritten via the pointer parameter. It runs after struct promotion and address-escape analysis. - fgMorphImplicitByRefArgs is moved out of fgMarkAddressExposedLocals, and into fgMorphBlocks. It rewrites any implicit byref parameters which have not been promoted to have the extra indirection at all their references. - fgMarkDemotedImplicitByRefArgs runs after fgMorphBlocks, and cleans up some of the LclVarDsc annotations used to communicate across the previous pieces.
Remove the artificial limitation marking them address-taken. Add a heuristic that avoids promoting if there are too few references to the parameter.
065080a
to
0814907
Compare
@dotnet-bot test this please |
This change extends struct promotion to "implicit byref" parameters (i.e. structs which are passed by value at source level but for which the ABI dictates that the caller make a copy in its frame and pass the callee a pointer to the copy). Struct promotion is how we do field-sensitive analysis, so this enables propagating facts about
Span<T>
parameters' lengths to eliminate redundant checks, and similar. Struct promotion annotations exist on local var descriptors, which poses an issue here because the incoming struct memory is not a local of the callee. To address this, promotion of implicit byref parameters is implemented by:
Aside from field-sensitive propagation of values etc., a beneficial effect on the generated code (that falls out of the above approach) is that the references throughout the method body are references to locals instead of pointer dereferences, so this change results in elimination of many more redundant struct field loads.
The struct copy to initialize the new local of course incurs a cost that must be weighed against the benefits of exposing the individual fields and improved redundant load elimination. This change uses a simple heuristic that checks whether a parameter is address-exposed, and also compares the number of references to the parameter in the method body against the number of fields; it effectively avoids doing the promotion for parameters that are address-exposed or are not heavily referenced.
Avoiding the struct copy by adding the capacity to reason about promoted structs that don't reside in the current method's frame (and recognize these "restrict" pointer parameters' referents as not killed by arbitrary pointer dereferences, etc.) could alleviate the costs, as could optimizations like PDE or rematerialization to move the copies to more optimal code paths; these are left as possible future improvements.
This necessitated a change in phase ordering; the implicit byref rewriting now happens in a few stages:
implicit byrefs, and runs before struct promotion.
to pointer), and decides which promotions are beneficial. It inserts
the initializing struct copies for beneficial ones, and annotates
non-beneficial ones so their appearances will be rewritten via the
pointer parameter. It runs after struct promotion and address-escape
analysis.
and into fgMorphBlocks. It rewrites any implicit byref parameters
which have not been promoted to have the extra indirection at all their
references.
some of the LclVarDsc annotations used to communicate across the
previous pieces.