-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
macos tlv workaround #60341
macos tlv workaround #60341
Conversation
Thanks for this! The last commit here looks like the real meat of the PR but I'm not sure that I entirely understand what's going on, it's definitely counterintuitive that a volatile load makes things more performant! Do you know what's happening at the LLVM layer? Do you, for example, have some LLVM IR of before/after to see what's going on? It'd be nice if we could avoid this trick and instead just tweak the codegen slightly to be more amenable to better code generation on the backend. |
The last commit is indeed the meat. The other two commits are just deleting dead code.
Here's some sample code and associated codegen: #[inline(never)]
#[cold]
fn init() -> String {
String::from("hello world. this string is long")
}
pub fn get_the_thread_local() -> String {
thread_local! {
static X: String = init();
}
X.with(|x| x.clone())
} IR _without_ `read_volatile` (`--release`); thread_local::get_the_thread_local
; Function Attrs: uwtable
define void @_ZN12thread_local20get_the_thread_local17h709f40512ac1fd32E(%"alloc::string::String"* noalias nocapture sret dereferenceable(24)) unnamed_addr #1 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
%value.i.i.i = alloca %"alloc::string::String", align 16
%_15.i.i = alloca %"alloc::string::String", align 8
%_3.sroa.6.i = alloca [2 x i64], align 8
%_3.sroa.6.0.sroa_cast9.i = bitcast [2 x i64]* %_3.sroa.6.i to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %_3.sroa.6.0.sroa_cast9.i)
%.val.i.i.i.i = load i8, i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 25), align 1, !noalias !25
%1 = icmp eq i8 %.val.i.i.i.i, 0
br i1 %1, label %bb7.i.i.i.i, label %bb4.i.i
bb7.i.i.i.i: ; preds = %start
%.val.i.i.i.i.i = load i8, i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 24), align 8, !noalias !25
%2 = icmp eq i8 %.val.i.i.i.i.i, 0
br i1 %2, label %bb7.i.i.i.i.i, label %bb11.i.i
bb7.i.i.i.i.i: ; preds = %bb7.i.i.i.i
; call std::sys::unix::fast_thread_local::register_dtor
call void @_ZN3std3sys4unix17fast_thread_local13register_dtor17hea5fd2f6f1030afcE(i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 0), void (i8*)* nonnull @_ZN3std6thread5local4fast13destroy_value17h328a717a41c9449aE), !noalias !25
store i8 1, i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 24), align 8, !noalias !25
br label %bb11.i.i
bb11.i.i: ; preds = %bb7.i.i.i.i.i, %bb7.i.i.i.i
%3 = bitcast %"alloc::string::String"* %_15.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %3), !noalias !25
%4 = load {}*, {}** bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E to {}**), align 8, !noalias !25
%5 = icmp eq {}* %4, null
br i1 %5, label %bb13.i.i, label %bb15.i.i
bb13.i.i: ; preds = %bb11.i.i
%6 = bitcast %"alloc::string::String"* %value.i.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %6), !noalias !25
; call thread_local::init
call fastcc void @_ZN12thread_local4init17h88c40ced9b6cbe50E(%"alloc::string::String"* noalias nocapture nonnull dereferenceable(24) %value.i.i.i) #7, !noalias !25
%7 = bitcast %"alloc::string::String"* %value.i.i.i to <2 x i64>*
%8 = load <2 x i64>, <2 x i64>* %7, align 16, !noalias !25
%_11.sroa.0.sroa.5.0..sroa_idx21.i.i.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %value.i.i.i, i64 0, i32 1, i32 3
%_11.sroa.0.sroa.5.0.copyload.i.i.i = load i64, i64* %_11.sroa.0.sroa.5.0..sroa_idx21.i.i.i, align 16, !noalias !25
%z.i.i.i.sroa.0.0.copyload.i.i.i = load {}*, {}** bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E to {}**), align 8, !noalias !30
%z.i.i.i.sroa.4.0.copyload.i.i.i = load i64, i64* bitcast (i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 8) to i64*), align 8, !noalias !30
store <2 x i64> %8, <2 x i64>* bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E to <2 x i64>*), align 8, !noalias !34
store i64 %_11.sroa.0.sroa.5.0.copyload.i.i.i, i64* bitcast (i8* getelementptr inbounds (<{ [32 x i8] }>, <{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E, i64 0, i32 0, i64 16) to i64*), align 8, !noalias !34
%9 = icmp eq {}* %z.i.i.i.sroa.0.0.copyload.i.i.i, null
%10 = icmp eq i64 %z.i.i.i.sroa.4.0.copyload.i.i.i, 0
%or.cond.i.i.i = or i1 %9, %10
br i1 %or.cond.i.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17hf02e3e1800d44f2aE.exit.i.i", label %bb4.i.i.i.i.i.i.i.i.i
bb4.i.i.i.i.i.i.i.i.i: ; preds = %bb13.i.i
%11 = bitcast {}* %z.i.i.i.sroa.0.0.copyload.i.i.i to i8*
tail call void @__rust_dealloc(i8* nonnull %11, i64 %z.i.i.i.sroa.4.0.copyload.i.i.i, i64 1) #7, !noalias !25
br label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17hf02e3e1800d44f2aE.exit.i.i"
"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17hf02e3e1800d44f2aE.exit.i.i": ; preds = %bb4.i.i.i.i.i.i.i.i.i, %bb13.i.i
call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %6), !noalias !25
br label %bb15.i.i
bb15.i.i: ; preds = %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17hf02e3e1800d44f2aE.exit.i.i", %bb11.i.i
; call <alloc::string::String as core::clone::Clone>::clone
call void @"_ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h5e0882d4896c9b5eE"(%"alloc::string::String"* noalias nocapture nonnull sret dereferenceable(24) %_15.i.i, %"alloc::string::String"* noalias nonnull readonly align 8 dereferenceable(24) bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h53cabb331a543277E to %"alloc::string::String"*)), !noalias !25
%_3.sroa.0.0..sroa_cast.i = bitcast %"alloc::string::String"* %_15.i.i to {}**
%_3.sroa.0.0.copyload.i = load {}*, {}** %_3.sroa.0.0..sroa_cast.i, align 8, !noalias !35
%_3.sroa.6.0..sroa_idx.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %_15.i.i, i64 0, i32 1, i32 1, i32 1
%_3.sroa.6.0..sroa_cast.i = bitcast i64* %_3.sroa.6.0..sroa_idx.i to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %_3.sroa.6.0.sroa_cast9.i, i8* nonnull align 8 %_3.sroa.6.0..sroa_cast.i, i64 16, i1 false), !noalias !35
call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %3), !noalias !25
%12 = icmp eq {}* %_3.sroa.0.0.copyload.i, null
br i1 %12, label %bb4.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17hf1fd224675692f45E.exit"
bb4.i.i: ; preds = %bb15.i.i, %start
; call core::result::unwrap_failed
tail call fastcc void @_ZN4core6result13unwrap_failed17hf64e0430bdaf4bcaE(), !noalias !36
unreachable
"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17hf1fd224675692f45E.exit": ; preds = %bb15.i.i
%_3.sroa.0.0..sroa_cast2.i = bitcast %"alloc::string::String"* %0 to {}**
store {}* %_3.sroa.0.0.copyload.i, {}** %_3.sroa.0.0..sroa_cast2.i, align 8, !alias.scope !36
%_3.sroa.6.0..sroa_idx6.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %0, i64 0, i32 1, i32 1, i32 1
%_3.sroa.6.0..sroa_cast7.i = bitcast i64* %_3.sroa.6.0..sroa_idx6.i to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %_3.sroa.6.0..sroa_cast7.i, i8* nonnull align 8 %_3.sroa.6.0.sroa_cast9.i, i64 16, i1 false), !alias.scope !40
call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %_3.sroa.6.0.sroa_cast9.i)
ret void
} IR with `read_volatile` (`--release`); thread_local::get_the_thread_local
; Function Attrs: uwtable
define void @_ZN12thread_local20get_the_thread_local17hceac182570e9506dE(%"alloc::string::String"* noalias nocapture sret dereferenceable(24)) unnamed_addr #1 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
%value.i.i.i = alloca %"alloc::string::String", align 16
%self.i.i.i.i = alloca %"std::thread::local::fast::Key<alloc::string::String>"*, align 8
%_15.i.i = alloca %"alloc::string::String", align 8
%_3.sroa.6.i = alloca [2 x i64], align 8
%_3.sroa.6.0.sroa_cast9.i = bitcast [2 x i64]* %_3.sroa.6.i to i8*
call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %_3.sroa.6.0.sroa_cast9.i)
%self.i.i.i.i.0.sroa_cast = bitcast %"std::thread::local::fast::Key<alloc::string::String>"** %self.i.i.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %self.i.i.i.i.0.sroa_cast)
store %"std::thread::local::fast::Key<alloc::string::String>"* bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h1f31f65c90417afeE to %"std::thread::local::fast::Key<alloc::string::String>"*), %"std::thread::local::fast::Key<alloc::string::String>"** %self.i.i.i.i, align 8, !noalias !25
%self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i = load volatile %"std::thread::local::fast::Key<alloc::string::String>"*, %"std::thread::local::fast::Key<alloc::string::String>"** %self.i.i.i.i, align 8, !noalias !25, !nonnull !0
%1 = getelementptr inbounds %"std::thread::local::fast::Key<alloc::string::String>", %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i, i64 0, i32 5
%.val.i.i.i.i = load i8, i8* %1, align 1, !noalias !25
%2 = icmp eq i8 %.val.i.i.i.i, 0
br i1 %2, label %bb8.i.i.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$8try_with17ha49ffcfe07458aa0E.exit.thread.i"
bb8.i.i.i.i: ; preds = %start
%3 = getelementptr inbounds %"std::thread::local::fast::Key<alloc::string::String>", %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i, i64 0, i32 3
%.val.i.i.i.i.i = load i8, i8* %3, align 8, !noalias !25
%4 = icmp eq i8 %.val.i.i.i.i.i, 0
br i1 %4, label %bb7.i.i.i.i.i, label %bb11.i.i
bb7.i.i.i.i.i: ; preds = %bb8.i.i.i.i
%5 = bitcast %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i to i8*
; call std::sys::unix::fast_thread_local::register_dtor
call void @_ZN3std3sys4unix17fast_thread_local13register_dtor17ha1a2dbcd05fabbafE(i8* nonnull %5, void (i8*)* nonnull @_ZN3std6thread5local4fast13destroy_value17haead139569d8605fE), !noalias !25
store i8 1, i8* %3, align 8, !noalias !25
br label %bb11.i.i
"_ZN3std6thread5local17LocalKey$LT$T$GT$8try_with17ha49ffcfe07458aa0E.exit.thread.i": ; preds = %start
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %self.i.i.i.i.0.sroa_cast)
br label %bb4.i.i
bb11.i.i: ; preds = %bb7.i.i.i.i.i, %bb8.i.i.i.i
call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %self.i.i.i.i.0.sroa_cast)
%6 = bitcast %"alloc::string::String"* %_15.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %6), !noalias !25
%7 = bitcast %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i to {}**
%8 = load {}*, {}** %7, align 8, !noalias !25
%9 = icmp eq {}* %8, null
br i1 %9, label %bb13.i.i, label %bb15.i.i
bb13.i.i: ; preds = %bb11.i.i
%10 = bitcast %"alloc::string::String"* %value.i.i.i to i8*
call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %10), !noalias !25
; call thread_local::init
call fastcc void @_ZN12thread_local4init17h08fde1884c30376bE(%"alloc::string::String"* noalias nocapture nonnull dereferenceable(24) %value.i.i.i) #6, !noalias !25
%11 = bitcast %"alloc::string::String"* %value.i.i.i to <2 x i64>*
%12 = load <2 x i64>, <2 x i64>* %11, align 16, !noalias !25
%_11.sroa.0.sroa.5.0..sroa_idx21.i.i.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %value.i.i.i, i64 0, i32 1, i32 3
%_11.sroa.0.sroa.5.0.copyload.i.i.i = load i64, i64* %_11.sroa.0.sroa.5.0..sroa_idx21.i.i.i, align 16, !noalias !25
%z.i.i.i.sroa.0.0.copyload.i.i.i = load {}*, {}** %7, align 8, !noalias !30
%13 = getelementptr inbounds %"std::thread::local::fast::Key<alloc::string::String>", %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i, i64 0, i32 1, i32 1, i32 2, i64 0
%z.i.i.i.sroa.4.0.copyload.i.i.i = load i64, i64* %13, align 8, !noalias !30
%14 = getelementptr inbounds %"std::thread::local::fast::Key<alloc::string::String>", %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i, i64 0, i32 1, i32 1, i32 2, i64 1
%15 = bitcast %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i to <2 x i64>*
store <2 x i64> %12, <2 x i64>* %15, align 8, !noalias !34
store i64 %_11.sroa.0.sroa.5.0.copyload.i.i.i, i64* %14, align 8, !noalias !34
%16 = icmp eq {}* %z.i.i.i.sroa.0.0.copyload.i.i.i, null
%17 = icmp eq i64 %z.i.i.i.sroa.4.0.copyload.i.i.i, 0
%or.cond.i.i.i = or i1 %16, %17
br i1 %or.cond.i.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17h4a7d4a29fdf48028E.exit.i.i", label %bb4.i.i.i.i.i.i.i.i.i
bb4.i.i.i.i.i.i.i.i.i: ; preds = %bb13.i.i
%18 = bitcast {}* %z.i.i.i.sroa.0.0.copyload.i.i.i to i8*
tail call void @__rust_dealloc(i8* nonnull %18, i64 %z.i.i.i.sroa.4.0.copyload.i.i.i, i64 1) #6, !noalias !25
br label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17h4a7d4a29fdf48028E.exit.i.i"
"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17h4a7d4a29fdf48028E.exit.i.i": ; preds = %bb4.i.i.i.i.i.i.i.i.i, %bb13.i.i
call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %10), !noalias !25
br label %bb15.i.i
bb15.i.i: ; preds = %"_ZN3std6thread5local17LocalKey$LT$T$GT$4init17h4a7d4a29fdf48028E.exit.i.i", %bb11.i.i
%_19.0.i.i = bitcast %"std::thread::local::fast::Key<alloc::string::String>"* %self.i.i.i.i.0.self.i.i.i.0.self.i.i.0.self.i.0.self.0..i.i.i.i to %"alloc::string::String"*
; call <alloc::string::String as core::clone::Clone>::clone
call void @"_ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h9234dcb674122143E"(%"alloc::string::String"* noalias nocapture nonnull sret dereferenceable(24) %_15.i.i, %"alloc::string::String"* noalias nonnull readonly align 8 dereferenceable(24) %_19.0.i.i), !noalias !25
%_3.sroa.0.0..sroa_cast.i = bitcast %"alloc::string::String"* %_15.i.i to {}**
%_3.sroa.0.0.copyload.i = load {}*, {}** %_3.sroa.0.0..sroa_cast.i, align 8, !noalias !35
%_3.sroa.6.0..sroa_idx.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %_15.i.i, i64 0, i32 1, i32 1, i32 1
%_3.sroa.6.0..sroa_cast.i = bitcast i64* %_3.sroa.6.0..sroa_idx.i to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %_3.sroa.6.0.sroa_cast9.i, i8* nonnull align 8 %_3.sroa.6.0..sroa_cast.i, i64 16, i1 false), !noalias !35
call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %6), !noalias !25
%19 = icmp eq {}* %_3.sroa.0.0.copyload.i, null
br i1 %19, label %bb4.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17h0b7fbfbefd581825E.exit"
bb4.i.i: ; preds = %bb15.i.i, %"_ZN3std6thread5local17LocalKey$LT$T$GT$8try_with17ha49ffcfe07458aa0E.exit.thread.i"
; call core::result::unwrap_failed
tail call fastcc void @_ZN4core6result13unwrap_failed17hcfbdd175a3d3f832E(), !noalias !36
unreachable
"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17h0b7fbfbefd581825E.exit": ; preds = %bb15.i.i
%_3.sroa.0.0..sroa_cast2.i = bitcast %"alloc::string::String"* %0 to {}**
store {}* %_3.sroa.0.0.copyload.i, {}** %_3.sroa.0.0..sroa_cast2.i, align 8, !alias.scope !36
%_3.sroa.6.0..sroa_idx6.i = getelementptr inbounds %"alloc::string::String", %"alloc::string::String"* %0, i64 0, i32 1, i32 1, i32 1
%_3.sroa.6.0..sroa_cast7.i = bitcast i64* %_3.sroa.6.0..sroa_idx6.i to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 8 %_3.sroa.6.0..sroa_cast7.i, i8* nonnull align 8 %_3.sroa.6.0.sroa_cast9.i, i64 16, i1 false), !alias.scope !40
call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %_3.sroa.6.0.sroa_cast9.i)
ret void
} I can only guess at what's going on at the LLVM layer, and unfortunately I don't know much about LLVM's inner workings or where to look for tweaking codgen in rustc. The actual fast thread local asm without `read_volatile`__ZN12thread_local20get_the_thread_local17ha0569ef21dbb73eaE:
pushq %rbp
movq %rsp, %rbp
pushq %r14
pushq %rbx
subq $64, %rsp
movq %rdi, %rbx
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
cmpb $0, 25(%rax)
jne LBB4_9
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
cmpb $0, 24(%rax)
jne LBB4_3
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
movq %rax, %r14
leaq __ZN3std6thread5local4fast13destroy_value17h2b601cce828bc311E(%rip), %rsi
movq %rax, %rdi
callq __ZN3std3sys4unix17fast_thread_local13register_dtor17hea5fd2f6f1030afcE
movb $1, 24(%r14)
LBB4_3:
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
cmpq $0, (%rax)
je LBB4_4
LBB4_7:
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
leaq -48(%rbp), %rdi
movq %rax, %rsi
callq __ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h5e0882d4896c9b5eE
movq -48(%rbp), %rax
movq -40(%rbp), %rcx
movq %rcx, -64(%rbp)
movq -32(%rbp), %rcx
movq %rcx, -56(%rbp)
testq %rax, %rax
je LBB4_9
movq %rax, (%rbx)
movq -64(%rbp), %rax
movq -56(%rbp), %rcx
movq %rcx, 16(%rbx)
movq %rax, 8(%rbx)
movq %rbx, %rax
addq $64, %rsp
popq %rbx
popq %r14
popq %rbp
retq
LBB4_4:
leaq -48(%rbp), %rdi
callq __ZN12thread_local4init17h71551753bba23e2bE
movaps -48(%rbp), %xmm0
movaps %xmm0, -80(%rbp)
movq -32(%rbp), %rcx
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h5103305ae64a8553E@TLVP(%rip), %rdi
callq *(%rdi)
movq (%rax), %rdi
movq 8(%rax), %rsi
movaps -80(%rbp), %xmm0
movups %xmm0, (%rax)
movq %rcx, 16(%rax)
testq %rdi, %rdi
je LBB4_7
testq %rsi, %rsi
je LBB4_7
movl $1, %edx
callq ___rust_dealloc
jmp LBB4_7
LBB4_9:
callq __ZN4core6result13unwrap_failed17ha41e6baa6f090bd5E It's also interesting that the IR for a linux release build is the same, but the assembly only performs linux asm_ZN12thread_local20get_the_thread_local17h53234738c97e2accE:
pushq %r15
pushq %r14
pushq %rbx
subq $48, %rsp
movq %rdi, %r14
leaq _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@TLSLD(%rip), %rdi
callq __tls_get_addr@PLT
cmpb $0, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF+25(%rax)
jne .LBB4_8
movq %rax, %rbx
cmpb $0, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF+24(%rbx)
jne .LBB4_3
movq %rbx, %rax
movq %rax, %r15
leaq _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF(%rax), %rdi
leaq _ZN3std6thread5local4fast13destroy_value17h5fc509333a064006E(%rip), %rsi
callq *_ZN3std3sys4unix17fast_thread_local13register_dtor17h7d5b1813798466baE@GOTPCREL(%rip)
movb $1, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF+24(%r15)
.LBB4_3:
movq %rbx, %rax
cmpq $0, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF(%rbx)
je .LBB4_4
.LBB4_7:
leaq _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF(%rbx), %rsi
movq %rsp, %rdi
callq *_ZN60_$LT$alloc..string..String$u20$as$u20$core..clone..Clone$GT$5clone17h5526323efb1899c4E@GOTPCREL(%rip)
movq (%rsp), %rax
movups 8(%rsp), %xmm0
movaps %xmm0, 32(%rsp)
testq %rax, %rax
je .LBB4_8
movq %rax, (%r14)
movaps 32(%rsp), %xmm0
movups %xmm0, 8(%r14)
movq %r14, %rax
addq $48, %rsp
popq %rbx
popq %r14
popq %r15
retq
.LBB4_4:
movq %rsp, %rdi
callq _ZN12thread_local4init17h8519144e0d22f3f8E
movaps (%rsp), %xmm0
movq 16(%rsp), %rcx
movq %rbx, %rax
movq _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF(%rbx), %rdi
movq _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF+8(%rbx), %rsi
movaps %xmm0, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF(%rbx)
movq %rcx, _ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hde48327e2980a364E@DTPOFF+16(%rbx)
testq %rdi, %rdi
je .LBB4_7
testq %rsi, %rsi
je .LBB4_7
movl $1, %edx
callq *__rust_dealloc@GOTPCREL(%rip)
jmp .LBB4_7
.LBB4_8:
callq _ZN4core6result13unwrap_failed17h1d58e680aff0dbf0E
ud2
Any ideas/tips for fixing the codegen, or thoughts on where to look in the source? I am pretty motivated to get something done on this. |
Ok thanks for the info! I think I see what's going on here, and this is arguably an LLVM bug I think. What appears to be happening is that LLVM's optimizing for raw accesses to the global, so it's optimizing as much as possible to reference items through the global rather than through some intermediate value (because then it has more knowledge about what's happening). On Linux I believe TLS is a two-level system where you call some fancy function to get the base address and then you offset from that to get the actual item. It looks like LLVM is optimized at the backend to understand that the base address is memoizable so it only needs to call that function once rather than once-per-reference. On OSX, however, it looks like TLS works differently where it's a dynamic function call to get the address of each variable (I think?). It looks like LLVM isn't realizing that it's probably always the same address and as a result is calling the function once per reference. Note though that the use of So at it's root it'd be probably good to report this bug to upstream LLVM. This reproduction I believe shows the issue where the Otherwise though it seems reasonable to fix this issue otherwise on our side of things in the meantime. I'd lean towards a solution without volatile reads though to ensure that we don't accidentally break other optimizations others are relying on with thread locals. Would it be possible to perhaps reorganize the code such that it only references the address of the thread local once? This would likely involve some trickery to ensure LLVM is convinced to only reference the global once, but there may be some cleverness we could restructure with raw pointers or something like that maybe? If it's not possible without volatile though I don't know of anything concrete that gets pessimized with volatile loads so I think we should probably go ahead with the PR as-is. Just figure it's good to try without the volatile first and see how far we can go! It'd also be great to file an upstream LLVM bug and reference that in the code where the volatile read happens if that stays. |
@alexcrichton Thank you for all the information, and the repro! Everything you said makes perfect sense. I'll file an LLVM bug report, attempt a more optimizer friendly version of the final commit, update this PR and ping you when there's something worth reviewing. I have some ideas for working around the misoptimization by packing all the metadata about the thread local into a single word. Instead of an enum State {
Uninitialized,
..
DtorRunning,
} or similar packing would reduce the number of offsets that need to be read on every call to |
That sounds reasonable to me yeah, and will probably be a win across the board for reorganizing the internals :) |
e2f2248
to
fd69db3
Compare
The job Click to expand the log.
I'm a bot! I can only do what humans tell me to, so if this was not helpful or you have suggestions for improvements, please ping or otherwise contact |
fd69db3
to
619ab84
Compare
The job Click to expand the log.
I'm a bot! I can only do what humans tell me to, so if this was not helpful or you have suggestions for improvements, please ping or otherwise contact |
The job Click to expand the log.
I'm a bot! I can only do what humans tell me to, so if this was not helpful or you have suggestions for improvements, please ping or otherwise contact |
@alexcrichton I have something working. Unfortunately we need at least 1 byte (except in match &self.opt { // calculate address of discriminant
Some(ref val) => Some(val), // calculate address of value storage
None => self.slow_path(), // returns Option<&'static T>
} However, when With that in mind, here's what a simpler version of the original example now looks like ( pub fn get_the_thread_local() -> usize {
thread_local! {
static X: String = String::from("hello world. this string is long");
}
X.with(|x| x.len())
} LLVM-IR ; thread_local::get_the_thread_local
; Function Attrs: uwtable
define i64 @_ZN12thread_local20get_the_thread_local17h9b1ad8597fd10cadE() unnamed_addr #4 personality i32 (i32, i32, i64, %"unwind::libunwind::_Unwind_Exception"*, %"unwind::libunwind::_Unwind_Context"*)* @rust_eh_personality {
start:
%0 = load {}*, {}** bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h6da2233c675cfa90E to {}**), align 8, !alias.scope !51
%1 = icmp eq {}* %0, null
br i1 %1, label %_ZN12thread_local20get_the_thread_local1X7__getit17h0fb9551d9b330632E.exit.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17ha7d145a93e39ed07E.exit"
_ZN12thread_local20get_the_thread_local1X7__getit17h0fb9551d9b330632E.exit.i.i: ; preds = %start
; call std::thread::local::fast::Key<T>::try_initialize_drop
%2 = tail call fastcc align 8 dereferenceable_or_null(24) i64* @"_ZN3std6thread5local4fast12Key$LT$T$GT$19try_initialize_drop17hdbbedea7d28cfb9fE"()
%3 = icmp eq i64* %2, null
br i1 %3, label %bb4.i.i, label %"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17ha7d145a93e39ed07E.exit"
bb4.i.i: ; preds = %_ZN12thread_local20get_the_thread_local1X7__getit17h0fb9551d9b330632E.exit.i.i
; call core::result::unwrap_failed
tail call fastcc void @_ZN4core6result13unwrap_failed17h73420553132c6252E()
unreachable
"_ZN3std6thread5local17LocalKey$LT$T$GT$4with17ha7d145a93e39ed07E.exit": ; preds = %start, %_ZN12thread_local20get_the_thread_local1X7__getit17h0fb9551d9b330632E.exit.i.i
%_0.0.i.i1.i.i = phi i64* [ %2, %_ZN12thread_local20get_the_thread_local1X7__getit17h0fb9551d9b330632E.exit.i.i ], [ bitcast (<{ [32 x i8] }>* @_ZN12thread_local20get_the_thread_local1X7__getit5__KEY17h6da2233c675cfa90E to i64*), %start ]
%4 = getelementptr i64, i64* %_0.0.i.i1.i.i, i64 2
%.idx.val.i.i = load i64, i64* %4, align 8, !alias.scope !54
ret i64 %.idx.val.i.i
} asm __ZN12thread_local20get_the_thread_local17h3ebe4a7ff0ca9672E:
pushq %rax
movq __ZN12thread_local20get_the_thread_local1X7__getit5__KEY17hcd2875a690ec9ddbE@TLVP(%rip), %rdi
callq *(%rdi)
cmpq $0, (%rax)
je LBB10_1
LBB10_2:
movq 16(%rax), %rax
popq %rcx
retq
LBB10_1:
callq __ZN3std6thread5local4fast12Key$LT$T$GT$19try_initialize_drop17hb0e61de580a0b1eeE
testq %rax, %rax
jne LBB10_2
callq __ZN4core6result13unwrap_failed17h6d9d014b9b2876d6E The final asm is now very tight for the fast path. This is likely a win for other platforms as well.
The
The The only other ways I found of reorganizing the code that resulted in 1x Looking forward to reading your feedback on this. I would understand if these changes are too drastic. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice wins! The new structure looks great to me. Could you place a comment somewhere that it's somewhat carefully crafted to ensure that the address of the thread local is read only once due to the LLVM bug? We probably can't really easily add a codegen test for this or verify it doesn't regress over time, but that's also fine since this doesn't really change all that often.
94529bf
to
5adfe47
Compare
The job Click to expand the log.
I'm a bot! I can only do what humans tell me to, so if this was not helpful or you have suggestions for improvements, please ping or otherwise contact |
pub unsafe fn get<F: FnOnce() -> T>(&self, init: F) -> Option<&'static T> { | ||
match self.inner.get() { | ||
Some(val) => Some(val), | ||
None => self.try_initialize(init), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So looking at this again, I'm just trying to follow the branching here and I'm thinking now, would this produce the same codegen?
match self.inner.get() {
Some(val) => Some(val),
None => {
if mem::needs_drop::<T>() {
self.register_dtor()?;
}
Some(self.inner.initialize(init))
}
}
where register_dtor
is basically what try_initialize_drop
is right now except it's missing the last self.inner.initialize
.
I'd want to be careful to not tamper with the codegen too much though. Is #[cold]
on try_initialize
needed for that? If so, then could try_initialize
be updated to have a similar structure to the match
I'm thinking of?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAICT self.inner.initialize
has to be behind a #[inline(never)]
function or else there's two tlv_get_addr
calls on macos. #[cold]
is optional, but seems to generate better asm IMO.
Putting the needs_drop
check behind an inline(never)
like so:
#[inline(never)]
#[cold]
unsafe fn try_initialize<F: FnOnce() -> T>(&self, init: F) -> Option<&'static T> {
if !mem::needs_drop::<T>() || self.try_register_dtor() {
Some(self.inner.initialize(init))
} else {
None
}
}
causes unwrapping code to be generated for LocalKey::with
, even for cases which always return Some
(needs_drop::<T>() == false
). I'll poke a little more at this on the weekend, but it is tricky to merge drop/nodrop paths together without hurting one or the other.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok I think #[cold]
is definitely an easy win (and one we should do here).
If #[inline(never)]
is only on try_register_dtor
(this hypothetical function) I think that the codegen should be basically the same as this PR currently is, meaning no unwrapping code but still two accesses perhaps?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The generated code does come out to be the same! I think that's a pretty good win.
Moreover, I was wrong about needing #[inline(never)]
anywhere. Only using #[cold]
and only on try_initialize
gives pretty equivalent results. Sometimes two accesses (try_initialize
is inlined), sometimes 1 (try_initialize
is not inlined).
@alexcrichton Thanks for help and time spent on this PR! I pushed up the inlining/refactor changes.
Ok I've got one tiny nit about code organization, but other than that r=me. I didn't mean to cause this to go up to two address lookups though so if you want to restore to the previous code that only had one access that's also ok. So long as it's documented as a bit funky because we're minimizing accesses that's fine by me! |
Shouldn't this go through perf.rlo first? |
Isn't perf Linux only? |
Unless I'm blind it changes generic code. |
This reverts commit d252f3b77f3b7d4cd59620588f9d026633c05816.
💔 Test failed - checks-travis |
The job Click to expand the log.
I'm a bot! I can only do what humans tell me to, so if this was not helpful or you have suggestions for improvements, please ping or otherwise contact |
@bors r- |
Ping from triage @mtak-, I think there's still CI failures to be addressed? |
@jonas-schievink Yeah! Sorry for letting this hang. I'll look into the failures tonight (PST). |
@jonas-schievink Should be good to go now - barring other CI failures. |
Great! I don't really understand the changed test, so I'll defer to the reviewer for one last check-in. |
@bors: r+ |
📌 Commit b148c25 has been approved by |
macos tlv workaround fixes: #60141 Includes: * remove dead code: `requires_move_before_drop`. This hasn't been needed for a while now (oops I should have removed it in #57655) * redox had a copy of `fast::Key` (not sure why?). That has been removed. * Perform a `read_volatile` on OSX to reduce `tlv_get_addr` calls per `__getit` from (4-2 depending on context) to 1. `tlv_get_addr` is relatively expensive (~1.5ns on my machine). Previously, in contexts where `__getit` was inlined, 4 calls to `tlv_get_addr` were performed per lookup. For some reason when `__getit` is not inlined this is reduced to 2x - and performance improves to match. After this PR, I have only ever seen 1x call to `tlv_get_addr` per `__getit`, and macos now benefits from situations where `__getit` is inlined. I'm not sure if the `read_volatile(&&__KEY)` trick is working around an LLVM bug, or a rustc bug, or neither. r? @alexcrichton
It would be better if this PR is squashed appropriately (But don't after it is testing). |
☀️ Test successful - checks-travis, status-appveyor |
|
That might be giving this PR too much credit/blame. Here's a more narrow time range showing tiny improvements across the board, but mostly noise: https://perf.rust-lang.org/compare.html?start=7d107613498c500699dcf89aab7231c28a8fa3af&end=3c805ce183840bd20bd81a54a02b3c8b6dccc9dd&stat=instructions%3Au |
Right I messed up. |
fixes: #60141
Includes:
requires_move_before_drop
. This hasn't been needed for a while now (oops I should have removed it in OSX: fix #57534 registering thread dtors while running thread dtors #57655)fast::Key
(not sure why?). That has been removed.read_volatile
on OSX to reducetlv_get_addr
calls per__getit
from (4-2 depending on context) to 1.tlv_get_addr
is relatively expensive (~1.5ns on my machine).Previously, in contexts where
__getit
was inlined, 4 calls totlv_get_addr
were performed per lookup. For some reason when__getit
is not inlined this is reduced to 2x - and performance improves to match.After this PR, I have only ever seen 1x call to
tlv_get_addr
per__getit
, and macos now benefits from situations where__getit
is inlined.I'm not sure if the
read_volatile(&&__KEY)
trick is working around an LLVM bug, or a rustc bug, or neither.r? @alexcrichton