Merge tag 'kvm-x86-vmx-6.3' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.3:

 - Handle NMI VM-Exits before leaving the noinstr region

 - A few trivial cleanups in the VM-Enter flows

 - Stop enabling VMFUNC for L1 purely to document that KVM doesn't support
   EPTP switching (or any other VM function) for L1

 - Fix a crash when using eVMCS's enlightened MSR bitmaps
bonzini committed Feb 15, 2023
2 parents 4bc6dca + 93827a0 commit 27b025e
Showing 11 changed files with 143 additions and 127 deletions.
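
Only some of those files are rendered below. The C-side caller for the first item in the commit message sits in one of the hunks that is not shown; what follows is a hedged sketch of that dispatch, reconstructed from symbols that do appear below (is_nmi(), vmx_do_nmi_irqoff(), exc_nmi_kvm_vmx). The enclosing function name and the vmx_get_intr_info() accessor are assumptions, not lines from this commit.

/*
 * Hedged sketch, not a hunk from this commit: while still inside the
 * noinstr VM-Exit path, check whether the exit was caused by an NMI and,
 * if so, bounce through the new asm stub so the NMI is handled before
 * instrumentation is re-enabled.
 */
static noinstr void vmx_handle_nmi_irqoff(struct vcpu_vmx *vmx)
{
        if (vmx->exit_reason.basic != EXIT_REASON_EXCEPTION_NMI)
                return;

        /* vmx_get_intr_info() assumed to be a noinstr-safe cached accessor. */
        if (is_nmi(vmx_get_intr_info(&vmx->vcpu)))
                vmx_do_nmi_irqoff();
}
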
16 changes: 6 additions & 10 deletions arch/x86/include/asm/idtentry.h
@@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);

/* NMI */

#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
* Special NOIST entry point for VMX which invokes this on the kernel
* stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
* 'executing' marker.
*
* On 32bit this just uses the regular NMI entry point because 32-bit does
* not have ISTs.
* Special entry point for VMX which invokes this on the kernel stack, even for
* 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
* correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
* to avoid more ifdeffery.
*/
DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
#else
#define asm_exc_nmi_noist asm_exc_nmi
DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
#endif

DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
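
For readers unfamiliar with the IDT entry macros, here is a hedged sketch of roughly what the DECLARE_IDTENTRY() line above expands to; this is where the asm_exc_nmi_kvm_vmx symbol exported from nmi.c below comes from. The real expansion has more to it (e.g. a Xen variant), so treat this as an approximation.

/* Approximate expansion of DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx): */
asmlinkage void asm_exc_nmi_kvm_vmx(void);              /* asm entry stub        */
__visible void exc_nmi_kvm_vmx(struct pt_regs *regs);   /* C handler, see nmi.c  */
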
8 changes: 4 additions & 4 deletions arch/x86/kernel/nmi.c
@@ -527,14 +527,14 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
mds_user_clear_cpu_buffers();
}

#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
DEFINE_IDTENTRY_RAW(exc_nmi_noist)
#if IS_ENABLED(CONFIG_KVM_INTEL)
DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
#endif
#endif

void stop_nmi(void)
12 changes: 12 additions & 0 deletions arch/x86/kvm/kvm_cache_regs.h
@@ -75,6 +75,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
}

/*
* kvm_register_test_and_mark_available() is a special snowflake that uses an
* arch bitop directly to avoid the explicit instrumentation that comes with
* the generic bitops. This allows code that cannot be instrumented (noinstr
* functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
*/
static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}

/*
* The "raw" register helpers are only for cases where the full 64 bits of a
* register are read/written irrespective of current vCPU mode. In other words,
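
A hedged sketch of the kind of caller the new kvm_register_test_and_mark_available() helper is meant for, modeled on KVM's lazy caching of VM-Exit info; the specific register, VMCS field, and accessor below are illustrative assumptions, not part of this hunk.

static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * test_and_set returns the old bit: false means this is the first
         * access since VM-Exit, so read the value from the VMCS; subsequent
         * callers (and this one, from now on) hit the cached copy.
         */
        if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
                vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

        return vmx->exit_qualification;
}
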
31 changes: 10 additions & 21 deletions arch/x86/kvm/vmx/hyperv.h
@@ -196,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write32(unsigned long field, u32 value)
static __always_inline void evmcs_write32(unsigned long field, u32 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -208,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline void evmcs_write16(unsigned long field, u16 value)
static __always_inline void evmcs_write16(unsigned long field, u16 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -220,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}

static inline u64 evmcs_read64(unsigned long field)
static __always_inline u64 evmcs_read64(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);

@@ -230,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field)
return *(u64 *)((char *)current_evmcs + offset);
}

static inline u32 evmcs_read32(unsigned long field)
static __always_inline u32 evmcs_read32(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);

@@ -240,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field)
return *(u32 *)((char *)current_evmcs + offset);
}

static inline u16 evmcs_read16(unsigned long field)
static __always_inline u16 evmcs_read16(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);

@@ -250,16 +250,6 @@ static inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}

static inline void evmcs_touch_msr_bitmap(void)
{
if (unlikely(!current_evmcs))
return;

if (current_evmcs->hv_enlightenments_control.msr_bitmap)
current_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
}

static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -274,13 +264,12 @@ static inline void evmcs_load(u64 phys_addr)
void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
static inline void evmcs_write16(unsigned long field, u16 value) {}
static inline u64 evmcs_read64(unsigned long field) { return 0; }
static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; }
static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */

#define EVMPTR_INVALID (-1ULL)
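
The evmcs_touch_msr_bitmap() removal above ties into the last item of the commit message, the eVMCS enlightened-MSR-bitmap crash fix. Its replacement lives in a hunk not shown on this page; the sketch below is a hedged guess at its shape, operating on vmx->vmcs01 directly instead of current_evmcs so it stays valid even when no VMCS is loaded. Treat the function name and details as assumptions rather than the actual fix.

static inline void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
{
        /*
         * When running as a Hyper-V guest with eVMCS, L1 (Hyper-V) must be
         * told that the MSR bitmap changed.  Work on vmcs01's enlightened
         * VMCS directly rather than current_evmcs, which is only valid
         * while a VMCS is loaded.
         */
        if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
                struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;

                if (evmcs->hv_enlightenments_control.msr_bitmap)
                        evmcs->hv_clean_fields &=
                                ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
        }
}
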
21 changes: 8 additions & 13 deletions arch/x86/kvm/vmx/nested.c
@@ -5864,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
u32 function = kvm_rax_read(vcpu);

/*
* VMFUNC is only supported for nested guests, but we always enable the
* secondary control for simplicity; for non-nested mode, fake that we
* didn't by injecting #UD.
* VMFUNC should never execute cleanly while L1 is active; KVM supports
* VMFUNC for nested VMs, but not for L1.
*/
if (!is_guest_mode(vcpu)) {
if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6881,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
@@ -6913,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_PML;
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
}

if (cpu_has_vmx_vmfunc()) {
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VMFUNC;
/*
* Advertise EPTP switching unconditionally
* since we emulate it
* Advertise EPTP switching irrespective of hardware support,
* KVM emulates it in software so long as VMFUNC is supported.
*/
if (enable_ept)
msrs->vmfunc_controls =
VMX_VMFUNC_EPTP_SWITCHING;
if (cpu_has_vmx_vmfunc())
msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
}

/*
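
Since the comment in the hunk above notes that KVM emulates EPTP switching in software, here is a hedged, simplified sketch of what that emulation involves when L2 executes VMFUNC: read the selected entry from the EPTP list that L1 placed in guest memory, sanity check it, and install it for L2. The two helpers are stand-ins for illustration, not KVM APIs.

static int emulate_eptp_switch(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
        u32 index = kvm_rcx_read(vcpu); /* ECX selects the EPTP list entry */
        u64 new_eptp;

        /* The EPTP list is one 4K page, i.e. 512 eight-byte entries. */
        if (index >= 512)
                return 1;

        /* Stand-in: read entry 'index' from vmcs12->eptp_list_address. */
        if (read_eptp_list_entry(vcpu, vmcs12->eptp_list_address, index, &new_eptp))
                return 1;

        /* Stand-in: validate new_eptp and make it L2's active EPT root. */
        return install_l2_eptp(vcpu, new_eptp);
}
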
4 changes: 2 additions & 2 deletions arch/x86/kvm/vmx/vmcs.h
@@ -75,7 +75,7 @@ struct loaded_vmcs {
struct vmcs_controls_shadow controls_shadow;
};

static inline bool is_intr_type(u32 intr_info, u32 type)
static __always_inline bool is_intr_type(u32 intr_info, u32 type)
{
const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;

@@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info)
return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}

static inline bool is_nmi(u32 intr_info)
static __always_inline bool is_nmi(u32 intr_info)
{
return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
80 changes: 44 additions & 36 deletions arch/x86/kvm/vmx/vmenter.S
@@ -31,6 +31,39 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif

.macro VMX_DO_EVENT_IRQOFF call_insn call_target
/*
* Unconditionally create a stack frame, getting the correct RSP on the
* stack (for x86-64) would take two instructions anyways, and RBP can
* be used to restore RSP to make objtool happy (see below).
*/
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP

#ifdef CONFIG_X86_64
/*
* Align RSP to a 16-byte boundary (to emulate CPU behavior) before
* creating the synthetic interrupt stack frame for the IRQ/NMI.
*/
and $-16, %rsp
push $__KERNEL_DS
push %rbp
#endif
pushf
push $__KERNEL_CS
\call_insn \call_target

/*
* "Restore" RSP from RBP, even though IRET has already unwound RSP to
* the correct value. objtool doesn't know the callee will IRET and,
* without the explicit restore, thinks the stack is getting walloped.
* Using an unwind hint is problematic due to x86-64's dynamic alignment.
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
RET
.endm

.section .noinstr.text, "ax"

/**
@@ -69,8 +102,8 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2

/* Copy @flags to BL, _ASM_ARG3 is volatile. */
mov %_ASM_ARG3B, %bl
/* Copy @flags to EBX, _ASM_ARG3 is volatile. */
mov %_ASM_ARG3L, %ebx

lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -106,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX

/* Check if vmlaunch or vmresume is needed */
testb $VMX_RUN_VMRESUME, %bl
test $VMX_RUN_VMRESUME, %ebx

/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -128,7 +161,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX

/* Check EFLAGS.ZF from 'testb' above */
/* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
jz .Lvmlaunch

/*
Expand Down Expand Up @@ -266,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)

SYM_FUNC_END(__vmx_vcpu_run)

SYM_FUNC_START(vmx_do_nmi_irqoff)
VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
SYM_FUNC_END(vmx_do_nmi_irqoff)


.section .text, "ax"

Expand Down Expand Up @@ -320,35 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline)
SYM_FUNC_END(vmread_error_trampoline)
#endif

SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
/*
* Unconditionally create a stack frame, getting the correct RSP on the
* stack (for x86-64) would take two instructions anyways, and RBP can
* be used to restore RSP to make objtool happy (see below).
*/
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP

#ifdef CONFIG_X86_64
/*
* Align RSP to a 16-byte boundary (to emulate CPU behavior) before
* creating the synthetic interrupt stack frame for the IRQ/NMI.
*/
and $-16, %rsp
push $__KERNEL_DS
push %rbp
#endif
pushf
push $__KERNEL_CS
CALL_NOSPEC _ASM_ARG1

/*
* "Restore" RSP from RBP, even though IRET has already unwound RSP to
* the correct value. objtool doesn't know the callee will IRET and,
* without the explicit restore, thinks the stack is getting walloped.
* Using an unwind hint is problematic due to x86-64's dynamic alignment.
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
SYM_FUNC_START(vmx_do_interrupt_irqoff)
VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
SYM_FUNC_END(vmx_do_interrupt_irqoff)
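
To round out the macro's two instantiations: vmx_do_nmi_irqoff uses a direct call to the fixed asm_exc_nmi_kvm_vmx entry, while vmx_do_interrupt_irqoff receives the handler address in the first argument register and reaches it via CALL_NOSPEC. Below is a hedged sketch of a C-side caller for the interrupt case, which looks up the host IDT gate for the exiting vector; names outside the visible hunks are assumptions.

static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
        u32 intr_info = vmx_get_intr_info(vcpu);
        unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
        gate_desc *desc = (gate_desc *)host_idt_base + vector;

        /* Pass the host handler's address; the asm stub CALL_NOSPECs into it. */
        kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
        vmx_do_interrupt_irqoff(gate_offset(desc));
        kvm_after_interrupt(vcpu);
}
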
