Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Auto merge of #122059 - nyurik:with-as-const-str, r=<try>
Optimize write with as_const_str for shorter code Following up on #121001 Apparently this code generates significant code block for each call to `write()` with non-simple formatting string - approx 100 lines of assembly code, possibly due to `dyn` (?). See generated assembly code [here](https://github.com/nyurik/rust-optimize-format-str/compare/before-changes..with-my-change#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477): <details><summary>Details</summary> <p> This is the inlining of `write!(buffer, "Iteration {value} was written")` ```asm core::fmt::Write::write_fmt: // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 194 fn write_fmt(&mut self, args: Arguments<'_>) -> Result { push r15 push r14 push r13 push r12 push rbx mov rdx, rsi // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 427 match (self.pieces, self.args) { mov rcx, qword ptr [rsi + 8] mov rax, qword ptr [rsi + 24] // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 428 ([], []) => Some(""), cmp rcx, 1 je .LBB0_8 test rcx, rcx jne .LBB0_9 test rax, rax jne .LBB0_9 // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911 self.buf.reserve(self.len, additional); lea r12, [rdi + 16] lea rsi, [rip + .L__unnamed_2] xor ebx, ebx .LBB0_6: mov r14, qword ptr [r12] jmp .LBB0_7 .LBB0_8: // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429 ([s], []) => Some(s), test rax, rax je .LBB0_4 .LBB0_9: // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 1108 if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) } lea rsi, [rip + .L__unnamed_1] pop rbx pop r12 pop r13 pop r14 pop r15 jmp qword ptr [rip + core::fmt::write_internal@GOTPCREL] .LBB0_4: mov rax, qword ptr [rdx] // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429 ([s], []) => Some(s), mov rsi, qword ptr [rax] mov rbx, qword ptr [rax + 8] // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 248 if T::IS_ZST { usize::MAX } else { 
self.cap.0 } mov rax, qword ptr [rdi] // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911 self.buf.reserve(self.len, additional); mov r14, qword ptr [rdi + 16] // /home/nyurik/dev/rust/rust/library/core/src/num/mod.rs : 1281 uint_impl! { sub rax, r14 // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 392 additional > self.capacity().wrapping_sub(len) cmp rax, rbx // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 309 if self.needs_to_grow(len, additional) { jb .LBB0_5 .LBB0_7: mov rax, qword ptr [rdi + 8] // /home/nyurik/dev/rust/rust/library/core/src/ptr/mut_ptr.rs : 1046 unsafe { intrinsics::offset(self, count) } add rax, r14 mov r15, rdi // /home/nyurik/dev/rust/rust/library/core/src/intrinsics.rs : 2922 copy_nonoverlapping(src, dst, count) mov rdi, rax mov rdx, rbx call qword ptr [rip + memcpy@GOTPCREL] // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 2040 self.len += count; add r14, rbx mov qword ptr [r15 + 16], r14 // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 216 } xor eax, eax pop rbx pop r12 pop r13 pop r14 pop r15 ret .LBB0_5: // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911 self.buf.reserve(self.len, additional); lea r12, [rdi + 16] mov r15, rdi mov r13, rsi // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 310 do_reserve_and_handle(self, len, additional); mov rsi, r14 mov rdx, rbx call alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle mov rsi, r13 mov rdi, r15 jmp .LBB0_6 ``` </p> </details> ```rust #[inline] pub fn write(output: &mut dyn Write, args: Arguments<'_>) -> Result { if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) } } ``` So, this brings back the older experiment - where I used `if core::intrinsics::is_val_statically_known(s.is_some()) { s } else { None }` helper function, and called it in multiple places that used `write`. 
This is less than ideal because every caller of `write` must now repeat this logic, but it produces significantly smaller assembly code for the formatting case, and assembly identical to the current output for the "simple" (no formatting) case. See the [assembly comparison](https://github.com/nyurik/rust-optimize-format-str/compare/with-my-change..with-as-const-str#diff-6b404e954c692d8cdc8c452d819a216aa5dcf40522b5944639e9ad947279a477) between the current code and what this change brings (focus only on the `fmt/intel-lib.txt` and `str/intel-lib.txt` files). ```rust if let Some(s) = args.as_const_str() { self.write_str(s) } else { write(self, args) } ```
- Loading branch information