From 77f9b2728eb08456899e6500328e00ec4829dddf Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 14 Sep 2018 09:57:06 +0200 Subject: [PATCH] runtime: use MADV_FREE on Linux if available On Linux, sysUnused currently uses madvise(MADV_DONTNEED) to signal the kernel that a range of allocated memory contains unneeded data. After a successful call, the range (but not the data it contained before the call to madvise) is still available but the first access to that range will unconditionally incur a page fault (needed to 0-fill the range). A faster alternative is MADV_FREE, available since Linux 4.5. The mechanism is very similar, but the page fault will only be incurred if the kernel, between the call to madvise and the first access, decides to reuse that memory for something else. In sysUnused, test whether MADV_FREE is supported and fall back to MADV_DONTNEED in case it isn't. This requires making the return value of the madvise syscall available to the caller, so change runtime.madvise to return it. Fixes #23687 Change-Id: I962c3429000dd9f4a00846461ad128b71201bb04 Reviewed-on: https://go-review.googlesource.com/135395 Run-TryBot: Tobias Klauser TryBot-Result: Gobot Gobot Reviewed-by: Ian Lance Taylor --- src/runtime/defs2_linux.go | 5 ++++- src/runtime/defs_linux.go | 5 ++++- src/runtime/defs_linux_386.go | 1 + src/runtime/defs_linux_amd64.go | 1 + src/runtime/defs_linux_arm.go | 1 + src/runtime/defs_linux_arm64.go | 1 + src/runtime/defs_linux_mips64x.go | 1 + src/runtime/defs_linux_mipsx.go | 1 + src/runtime/defs_linux_ppc64.go | 1 + src/runtime/defs_linux_ppc64le.go | 1 + src/runtime/defs_linux_s390x.go | 1 + src/runtime/mem_linux.go | 13 +++++++++++-- src/runtime/stubs2.go | 3 ++- src/runtime/sys_dragonfly_amd64.s | 6 ++++-- src/runtime/sys_freebsd_386.s | 4 +++- src/runtime/sys_freebsd_amd64.s | 6 ++++-- src/runtime/sys_freebsd_arm.s | 15 ++++++++------- src/runtime/sys_linux_386.s | 2 +- src/runtime/sys_linux_amd64.s | 2 +- src/runtime/sys_linux_arm.s | 2 +- src/runtime/sys_linux_arm64.s | 2 +- src/runtime/sys_linux_mips64x.s | 2 +- src/runtime/sys_linux_mipsx.s | 4 ++-- src/runtime/sys_linux_ppc64x.s | 2 +- src/runtime/sys_linux_s390x.s | 2 +- src/runtime/sys_netbsd_386.s | 4 +++- src/runtime/sys_netbsd_amd64.s | 4 +++- src/runtime/sys_netbsd_arm.s | 11 ++++++----- src/runtime/sys_openbsd_386.s | 3 ++- src/runtime/sys_openbsd_amd64.s | 4 +++- src/runtime/sys_openbsd_arm.s | 4 ++-- 31 files changed, 77 insertions(+), 37 deletions(-) diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go index c10dfb86240002..b08c0dafe12f2f 100644 --- a/src/runtime/defs2_linux.go +++ b/src/runtime/defs2_linux.go @@ -58,7 +58,10 @@ const ( MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED - MADV_DONTNEED = C.MADV_DONTNEED + MADV_DONTNEED = C.MADV_DONTNEED + MADV_FREE = C.MADV_FREE + MADV_HUGEPAGE = C.MADV_HUGEPAGE + MADV_NOHUGEPAGE = C.MADV_HNOUGEPAGE SA_RESTART = C.SA_RESTART SA_ONSTACK = C.SA_ONSTACK diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go index 553366a50ba6c5..2d810136d98713 100644 --- a/src/runtime/defs_linux.go +++ b/src/runtime/defs_linux.go @@ -47,7 +47,10 @@ const ( MAP_PRIVATE = C.MAP_PRIVATE MAP_FIXED = C.MAP_FIXED - MADV_DONTNEED = C.MADV_DONTNEED + MADV_DONTNEED = C.MADV_DONTNEED + MADV_FREE = C.MADV_FREE + MADV_HUGEPAGE = C.MADV_HUGEPAGE + MADV_NOHUGEPAGE = C.MADV_HNOUGEPAGE SA_RESTART = C.SA_RESTART SA_ONSTACK = C.SA_ONSTACK diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go index a7e435f854fe3f..0ebac17aefa9e8 100644 --- a/src/runtime/defs_linux_386.go +++ b/src/runtime/defs_linux_386.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go index e8c6a212db770b..c0a0ef0dd4ec50 100644 --- a/src/runtime/defs_linux_amd64.go +++ b/src/runtime/defs_linux_amd64.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_arm.go b/src/runtime/defs_linux_arm.go index 62ec8fab5e9b55..43946bb79ca8e5 100644 --- a/src/runtime/defs_linux_arm.go +++ b/src/runtime/defs_linux_arm.go @@ -16,6 +16,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go index c295bc02575209..c2cc281ab4f5c2 100644 --- a/src/runtime/defs_linux_arm64.go +++ b/src/runtime/defs_linux_arm64.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go index df11cb0965d69c..9dacd5d1e9bacb 100644 --- a/src/runtime/defs_linux_mips64x.go +++ b/src/runtime/defs_linux_mips64x.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go index 702fbb51c861c4..9532ac54ee83da 100644 --- a/src/runtime/defs_linux_mipsx.go +++ b/src/runtime/defs_linux_mipsx.go @@ -22,6 +22,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_ppc64.go b/src/runtime/defs_linux_ppc64.go index 45363d12854bec..5a4326da07a94e 100644 --- a/src/runtime/defs_linux_ppc64.go +++ b/src/runtime/defs_linux_ppc64.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_ppc64le.go b/src/runtime/defs_linux_ppc64le.go index 45363d12854bec..5a4326da07a94e 100644 --- a/src/runtime/defs_linux_ppc64le.go +++ b/src/runtime/defs_linux_ppc64le.go @@ -18,6 +18,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go index ab90723f754858..a6cc9c48e91de2 100644 --- a/src/runtime/defs_linux_s390x.go +++ b/src/runtime/defs_linux_s390x.go @@ -19,6 +19,7 @@ const ( _MAP_FIXED = 0x10 _MADV_DONTNEED = 0x4 + _MADV_FREE = 0x8 _MADV_HUGEPAGE = 0xe _MADV_NOHUGEPAGE = 0xf diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go index 7aa48170a1164d..845f72ded2c163 100644 --- a/src/runtime/mem_linux.go +++ b/src/runtime/mem_linux.go @@ -5,6 +5,7 @@ package runtime import ( + "runtime/internal/atomic" "runtime/internal/sys" "unsafe" ) @@ -34,10 +35,12 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { return p } +var adviseUnused = uint32(_MADV_FREE) + func sysUnused(v unsafe.Pointer, n uintptr) { // By default, Linux's "transparent huge page" support will // merge pages into a huge page if there's even a single - // present regular page, undoing the effects of the DONTNEED + // present regular page, undoing the effects of madvise(adviseUnused) // below. On amd64, that means khugepaged can turn a single // 4KB page to 2MB, bloating the process's RSS by as much as // 512X. (See issue #8832 and Linux kernel bug @@ -102,7 +105,13 @@ func sysUnused(v unsafe.Pointer, n uintptr) { throw("unaligned sysUnused") } - madvise(v, n, _MADV_DONTNEED) + advise := atomic.Load(&adviseUnused) + if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 { + // MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is + // not supported. + atomic.Store(&adviseUnused, _MADV_DONTNEED) + madvise(v, n, _MADV_DONTNEED) + } } func sysUsed(v unsafe.Pointer, n uintptr) { diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go index 02249d0aadc671..c14db74003fe2f 100644 --- a/src/runtime/stubs2.go +++ b/src/runtime/stubs2.go @@ -25,7 +25,8 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 //go:noescape func open(name *byte, mode, perm int32) int32 -func madvise(addr unsafe.Pointer, n uintptr, flags int32) +// return value is only set on linux to be used in osinit() +func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32 // exitThread terminates the current thread, writing *wait = 0 when // the stack is safe to reclaim. diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index f0eb5f4e216149..b18e9676513d7e 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -260,9 +260,11 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL flags+16(FP), DX MOVQ $75, AX // madvise SYSCALL - // ignore failure - maybe pages are locked + JCC 2(PC) + MOVL $-1, AX + MOVL AX, ret+24(FP) RET - + TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 MOVQ new+0(FP), DI MOVQ old+8(FP), SI diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s index b8f685a32374a7..754689ba0558b1 100644 --- a/src/runtime/sys_freebsd_386.s +++ b/src/runtime/sys_freebsd_386.s @@ -163,7 +163,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4 TEXT runtime·madvise(SB),NOSPLIT,$-4 MOVL $75, AX // madvise INT $0x80 - // ignore failure - maybe pages are locked + JAE 2(PC) + MOVL $-1, AX + MOVL AX, ret+12(FP) RET TEXT runtime·setitimer(SB), NOSPLIT, $-4 diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index be191a078458fc..55959b3e3a333a 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -337,9 +337,11 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL flags+16(FP), DX MOVQ $75, AX // madvise SYSCALL - // ignore failure - maybe pages are locked + JCC 2(PC) + MOVL $-1, AX + MOVL AX, ret+24(FP) RET - + TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 MOVQ new+0(FP), DI MOVQ old+8(FP), SI diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s index 93bf569367e922..f347b9fa961b99 100644 --- a/src/runtime/sys_freebsd_arm.s +++ b/src/runtime/sys_freebsd_arm.s @@ -264,14 +264,15 @@ TEXT runtime·munmap(SB),NOSPLIT,$0 RET TEXT runtime·madvise(SB),NOSPLIT,$0 - MOVW addr+0(FP), R0 // arg 1 addr - MOVW n+4(FP), R1 // arg 2 len - MOVW flags+8(FP), R2 // arg 3 flags - MOVW $SYS_madvise, R7 - SWI $0 - // ignore failure - maybe pages are locked + MOVW addr+0(FP), R0 // arg 1 addr + MOVW n+4(FP), R1 // arg 2 len + MOVW flags+8(FP), R2 // arg 3 flags + MOVW $SYS_madvise, R7 + SWI $0 + MOVW.CS $-1, R0 + MOVW R0, ret+12(FP) RET - + TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 MOVW new+0(FP), R0 MOVW old+4(FP), R1 diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s index 4e914f3e6013d0..40b55a67eb8799 100644 --- a/src/runtime/sys_linux_386.s +++ b/src/runtime/sys_linux_386.s @@ -427,7 +427,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL n+4(FP), CX MOVL flags+8(FP), DX INVOKE_SYSCALL - // ignore failure - maybe pages are locked + MOVL AX, ret+12(FP) RET // int32 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 4492dad02e7a64..7e846371e5620c 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -519,7 +519,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL flags+16(FP), DX MOVQ $SYS_madvise, AX SYSCALL - // ignore failure - maybe pages are locked + MOVL AX, ret+24(FP) RET // int64 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s index a709c4cbd050b7..43a58335c8068a 100644 --- a/src/runtime/sys_linux_arm.s +++ b/src/runtime/sys_linux_arm.s @@ -195,7 +195,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVW flags+8(FP), R2 MOVW $SYS_madvise, R7 SWI $0 - // ignore failure - maybe pages are locked + MOVW R0, ret+12(FP) RET TEXT runtime·setitimer(SB),NOSPLIT,$0 diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s index 086c8ddc637fe3..8b344be8f83b9e 100644 --- a/src/runtime/sys_linux_arm64.s +++ b/src/runtime/sys_linux_arm64.s @@ -401,7 +401,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 MOVW flags+16(FP), R2 MOVD $SYS_madvise, R8 SVC - // ignore failure - maybe pages are locked + MOVW R0, ret+24(FP) RET // int64 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s index 337299ba5fe084..c45703d22801c7 100644 --- a/src/runtime/sys_linux_mips64x.s +++ b/src/runtime/sys_linux_mips64x.s @@ -291,7 +291,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 MOVW flags+16(FP), R6 MOVV $SYS_madvise, R2 SYSCALL - // ignore failure - maybe pages are locked + MOVW R2, ret+24(FP) RET // int64 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s index dca5f1ee459a8f..f362b0f3f1c95d 100644 --- a/src/runtime/sys_linux_mipsx.s +++ b/src/runtime/sys_linux_mipsx.s @@ -302,13 +302,13 @@ TEXT runtime·munmap(SB),NOSPLIT,$0-8 UNDEF // crash RET -TEXT runtime·madvise(SB),NOSPLIT,$0-12 +TEXT runtime·madvise(SB),NOSPLIT,$0-16 MOVW addr+0(FP), R4 MOVW n+4(FP), R5 MOVW flags+8(FP), R6 MOVW $SYS_madvise, R2 SYSCALL - // ignore failure - maybe pages are locked + MOVW R2, ret+12(FP) RET // int32 futex(int32 *uaddr, int32 op, int32 val, struct timespec *timeout, int32 *uaddr2, int32 val2); diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s index 7c2f8ea6371768..ed79b69257848d 100644 --- a/src/runtime/sys_linux_ppc64x.s +++ b/src/runtime/sys_linux_ppc64x.s @@ -454,7 +454,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 MOVD n+8(FP), R4 MOVW flags+16(FP), R5 SYSCALL $SYS_madvise - // ignore failure - maybe pages are locked + MOVW R3, ret+24(FP) RET // int64 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s index 95401af62ecc2e..c79ceea7512f64 100644 --- a/src/runtime/sys_linux_s390x.s +++ b/src/runtime/sys_linux_s390x.s @@ -290,7 +290,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 MOVW flags+16(FP), R4 MOVW $SYS_madvise, R1 SYSCALL - // ignore failure - maybe pages are locked + MOVW R2, ret+24(FP) RET // int64 futex(int32 *uaddr, int32 op, int32 val, diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s index 4042ab4f8abd83..66f4620cab59ad 100644 --- a/src/runtime/sys_netbsd_386.s +++ b/src/runtime/sys_netbsd_386.s @@ -135,7 +135,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4 TEXT runtime·madvise(SB),NOSPLIT,$-4 MOVL $75, AX // sys_madvise INT $0x80 - // ignore failure - maybe pages are locked + JAE 2(PC) + MOVL $-1, AX + MOVL AX, ret+12(FP) RET TEXT runtime·setitimer(SB),NOSPLIT,$-4 diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 11b9c1b4173753..55236591965cde 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -319,7 +319,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL flags+16(FP), DX // arg 3 - behav MOVQ $75, AX // sys_madvise SYSCALL - // ignore failure - maybe pages are locked + JCC 2(PC) + MOVL $-1, AX + MOVL AX, ret+24(FP) RET TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s index 6b2c5a83572c2a..304075f295e31e 100644 --- a/src/runtime/sys_netbsd_arm.s +++ b/src/runtime/sys_netbsd_arm.s @@ -284,11 +284,12 @@ TEXT runtime·munmap(SB),NOSPLIT,$0 RET TEXT runtime·madvise(SB),NOSPLIT,$0 - MOVW addr+0(FP), R0 // arg 1 - addr - MOVW n+4(FP), R1 // arg 2 - len - MOVW flags+8(FP), R2 // arg 3 - behav - SWI $0xa0004b // sys_madvise - // ignore failure - maybe pages are locked + MOVW addr+0(FP), R0 // arg 1 - addr + MOVW n+4(FP), R1 // arg 2 - len + MOVW flags+8(FP), R2 // arg 3 - behav + SWI $0xa0004b // sys_madvise + MOVW.CS $-1, R0 + MOVW R0, ret+12(FP) RET TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s index 21f13c806e6351..8e34ab497afe15 100644 --- a/src/runtime/sys_openbsd_386.s +++ b/src/runtime/sys_openbsd_386.s @@ -136,7 +136,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$-4 MOVL $75, AX // sys_madvise INT $0x80 JAE 2(PC) - MOVL $0xf1, 0xf1 // crash + MOVL $-1, AX + MOVL AX, ret+12(FP) RET TEXT runtime·setitimer(SB),NOSPLIT,$-4 diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index 38ac38d9bf1824..227e81869c0f67 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -305,7 +305,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVL flags+16(FP), DX // arg 3 - behav MOVQ $75, AX // sys_madvise SYSCALL - // ignore failure - maybe pages are locked + JCC 2(PC) + MOVL $-1, AX + MOVL AX, ret+24(FP) RET TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s index ff1c1da9b97ee0..52d3638bc18c1f 100644 --- a/src/runtime/sys_openbsd_arm.s +++ b/src/runtime/sys_openbsd_arm.s @@ -143,8 +143,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 MOVW flags+8(FP), R2 // arg 2 - flags MOVW $75, R12 // sys_madvise SWI $0 - MOVW.CS $0, R8 // crash on syscall failure - MOVW.CS R8, (R8) + MOVW.CS $-1, R0 + MOVW R0, ret+12(FP) RET TEXT runtime·setitimer(SB),NOSPLIT,$0