[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RE: [PATCH RESEND v9 33/36] KVM: VMX: Add VMX_DO_FRED_EVENT_IRQOFF for IRQ/NMI handling



> > +#include "../../entry/calling.h"
> 
> Rather than do the low level PUSH_REGS and POP_REGS, I vote to have core code
> expose a FRED-specific wrapper for invoking external_interrupt().  More below.

Nice idea!

> > +   /*
> > +    * A FRED stack frame has extra 16 bytes of information pushed at the
> > +    * regular stack top compared to an IDT stack frame.
> 
> There is pretty much no chance that anyone remembers the layout of an IDT 
> stack
> frame off the top of their head.  I.e. saying "FRED has 16 bytes more" isn't 
> all
> that useful.  It also fails to capture the fact that FRED stuff a hell of a 
> lot
> more information in those "common" 48 bytes.
> 
> It'll be hard/impossible to capture all of the overload info in a comment, but
> showing the actual layout of the frame would be super helpful, e.g. something
> like
> this
> 
>       /*
>        * FRED stack frames are always 64 bytes:
>        *
>        * ------------------------------
>        * | Bytes  | Usage             |
>        * -----------------------------|
>        * | 63:56  | Reserved          |
>        * | 55:48  | Event Data        |
>          * | 47:40  | SS + Event Info   |
>          * | 39:32  | RSP               |
>        * | 31:24  | RFLAGS            |
>          * | 23:16  | CS + Aux Info     |
>          * |  15:8  | RIP               |
>          * |   7:0  | Error Code        |
>          * ------------------------------
>        */
>        *
>        * Use LEA to get the return RIP and push it, then push an error code.
>        * Note, only NMI handling does an ERETS to the target!  IRQ handling
>        * doesn't need to unmask NMIs and so simply uses CALL+RET, i.e. the
>        * RIP pushed here is only truly consumed for NMIs!

I take these as ASM code does need more love, i.e., nice comments :/

Otherwise only more confusion.

 
> 
> Jumping way back to providing a wrapper for FRED, if we do that, then there's 
> no
> need to include calling.h, and the weird wrinkle about the ERET target kinda 
> goes
> away too.  E.g. provide this in arch/x86/entry/entry_64_fred.S
> 
>       .section .text, "ax"
> 
> /* Note, this is instrumentation safe, and returns via RET, not ERETS! */
> #if IS_ENABLED(CONFIG_KVM_INTEL)
> SYM_CODE_START(fred_irq_entry_from_kvm)
>       FRED_ENTER
>       call external_interrupt
>       FRED_EXIT
>       RET
> SYM_CODE_END(fred_irq_entry_from_kvm)
> EXPORT_SYMBOL_GPL(fred_irq_entry_from_kvm);
> #endif
> 
> and then the KVM side for this particular chunk is more simply:
> 
>       lea 1f(%rip), %rax
>       push %rax
>       push $0         /* FRED error code, 0 for NMI and external
> interrupts */
> 
>       \branch_insn \branch_target
> 1:
>       VMX_DO_EVENT_FRAME_END
>       RET
> 
> 
> Alternatively, the whole thing could be shoved into
> arch/x86/entry/entry_64_fred.S,
> but at a glance I don't think that would be a net positive due to the need to
> handle
> IRQs vs. NMIs.
> 
> > +   \branch_insn \branch_target
> > +
> > +   .if \nmi == 0
> > +   POP_REGS
> > +   .endif
> > +
> > +1:
> > +   /*
> > +    * "Restore" RSP from RBP, even though IRET has already unwound RSP to
> 
> As mentioned above, this is incorrect on two fronts.
> 
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 0ecf4be2c6af..4e90c69a92bf 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -6890,6 +6890,14 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> >     memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
> >  }
> >
> > +#ifdef CONFIG_X86_FRED
> > +void vmx_do_fred_interrupt_irqoff(unsigned int vector);
> > +void vmx_do_fred_nmi_irqoff(unsigned int vector);
> > +#else
> > +#define vmx_do_fred_interrupt_irqoff(x) BUG()
> > +#define vmx_do_fred_nmi_irqoff(x) BUG()
> > +#endif
> 
> My slight preference is to open code the BUG() as a ud2 in assembly, purely to
> avoid more #ifdefs.
> 
> > +
> >  void vmx_do_interrupt_irqoff(unsigned long entry);
> >  void vmx_do_nmi_irqoff(void);
> >
> > @@ -6932,14 +6940,16 @@ static void handle_external_interrupt_irqoff(struct
> kvm_vcpu *vcpu)
> >  {
> >     u32 intr_info = vmx_get_intr_info(vcpu);
> >     unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > -   gate_desc *desc = (gate_desc *)host_idt_base + vector;
> >
> >     if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> >         "unexpected VM-Exit interrupt info: 0x%x", intr_info))
> >             return;
> >
> >     kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> > -   vmx_do_interrupt_irqoff(gate_offset(desc));
> > +   if (cpu_feature_enabled(X86_FEATURE_FRED))
> > +           vmx_do_fred_interrupt_irqoff(vector);   /* Event type is 0 */
> 
> I strongly prefer to use code to document what's going on.  E.g. the tail 
> comment
> just left me wondering, what event type is 0?  Whereas this makes it quite 
> clear
> that KVM is signaling a hardware interrupt.  The fact that it's a nop as far 
> as
> code generation goes is irrelevant.
> 
>       vmx_do_fred_interrupt_irqoff((EVENT_TYPE_HWINT << 16) | vector);

Better readability.

> 
> > +   else
> > +           vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base
> + vector));
> >     kvm_after_interrupt(vcpu);
> >
> >     vcpu->arch.at_instruction_boundary = true;
> 
> Here's a diff for (hopefully) everything I've suggested above.

Thanks a lot!  I will test it and update the patch in this mail thread.

> 
> ---
>  arch/x86/entry/entry_64_fred.S | 17 ++++++-
>  arch/x86/kernel/traps.c        |  5 --
>  arch/x86/kvm/vmx/vmenter.S     | 84 +++++++++++++++-------------------
>  arch/x86/kvm/vmx/vmx.c         |  7 +--
>  4 files changed, 55 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
> index 12063267d2ac..a973c0bd29f6 100644
> --- a/arch/x86/entry/entry_64_fred.S
> +++ b/arch/x86/entry/entry_64_fred.S
> @@ -10,7 +10,6 @@
>  #include "calling.h"
> 
>       .code64
> -     .section ".noinstr.text", "ax"
> 
>  .macro FRED_ENTER
>       UNWIND_HINT_END_OF_STACK
> @@ -24,6 +23,22 @@
>       POP_REGS
>  .endm
> 
> +     .section .text, "ax"
> +
> +/* Note, this is instrumentation safe, and returns via RET, not ERETS! */
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +SYM_CODE_START(fred_irq_entry_from_kvm)
> +     FRED_ENTER
> +     call external_interrupt
> +     FRED_EXIT
> +     RET
> +SYM_CODE_END(fred_irq_entry_from_kvm)
> +EXPORT_SYMBOL_GPL(fred_irq_entry_from_kvm);
> +#endif
> +
> +     .section ".noinstr.text", "ax"
> +
> +
>  /*
>   * The new RIP value that FRED event delivery establishes is
>   * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 21eeba7b188f..cbcb83c71dab 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -1566,11 +1566,6 @@ int external_interrupt(struct pt_regs *regs)
>       return 0;
>  }
> 
> -#if IS_ENABLED(CONFIG_KVM_INTEL)
> -/* For KVM VMX to handle IRQs in IRQ induced VM exits. */
> -EXPORT_SYMBOL_GPL(external_interrupt);
> -#endif
> -
>  #endif /* CONFIG_X86_64 */
> 
>  void __init trap_init(void)
> diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
> index 79a4c91d9434..e25df565c3f8 100644
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -9,7 +9,6 @@
>  #include <asm/segment.h>
>  #include "kvm-asm-offsets.h"
>  #include "run_flags.h"
> -#include "../../entry/calling.h"
> 
>  #define WORD_SIZE (BITS_PER_LONG / 8)
> 
> @@ -33,15 +32,31 @@
>  #define VCPU_R15     __VCPU_REGS_R15 * WORD_SIZE
>  #endif
> 
> +.macro VMX_DO_EVENT_FRAME_BEGIN
> +     /*
> +      * Unconditionally create a stack frame, getting the correct RSP on the
> +      * stack (for x86-64) would take two instructions anyways, and RBP can
> +      * be used to restore RSP to make objtool happy (see below).
> +      */
> +     push %_ASM_BP
> +     mov %_ASM_SP, %_ASM_BP
> +.endm
> +
> +.macro VMX_DO_EVENT_FRAME_END
> +     /*
> +      * "Restore" RSP from RBP, even though {E,I}RET has already unwound RSP
> +      * to the correct value *in most cases*.  KVM's IRQ handling with FRED
> +      * doesn't do ERETS, and objtool doesn't know the callee will IRET/ERET
> +      * and, without the explicit restore, thinks the stack is getting 
> walloped.
> +      * Using an unwind hint is problematic due to x86-64's dynamic 
> alignment.
> +      */
> +     mov %_ASM_BP, %_ASM_SP
> +     pop %_ASM_BP
> +.endm
> +
>  #ifdef CONFIG_X86_FRED
>  .macro VMX_DO_FRED_EVENT_IRQOFF branch_insn branch_target nmi=0
> -     /*
> -      * Unconditionally create a stack frame, getting the correct RSP on the
> -      * stack (for x86-64) would take two instructions anyways, and RBP can
> -      * be used to restore RSP to make objtool happy (see below).
> -      */
> -     push %_ASM_BP
> -     mov %_ASM_SP, %_ASM_BP
> +     VMX_DO_EVENT_FRAME_BEGIN
> 
>       /*
>        * Don't check the FRED stack level, the call stack leading to this
> @@ -78,43 +93,23 @@
>        * push an error code before invoking the IRQ/NMI handler.
>        *
>        * Use LEA to get the return RIP and push it, then push an error code.
> +      * Note, only NMI handling does an ERETS to the target!  IRQ handling
> +      * doesn't need to unmask NMIs and so simply uses CALL+RET, i.e. the
> +      * RIP pushed here is only truly consumed for NMIs!
>        */
>       lea 1f(%rip), %rax
>       push %rax
>       push $0         /* FRED error code, 0 for NMI and external
> interrupts */
> 
> -     .if \nmi == 0
> -     PUSH_REGS
> -     mov %rsp, %rdi
> -     .endif
> -
>       \branch_insn \branch_target
> -
> -     .if \nmi == 0
> -     POP_REGS
> -     .endif
> -
>  1:
> -     /*
> -      * "Restore" RSP from RBP, even though IRET has already unwound RSP to
> -      * the correct value.  objtool doesn't know the callee will IRET and,
> -      * without the explicit restore, thinks the stack is getting walloped.
> -      * Using an unwind hint is problematic due to x86-64's dynamic 
> alignment.
> -      */
> -     mov %_ASM_BP, %_ASM_SP
> -     pop %_ASM_BP
> +     VMX_DO_EVENT_FRAME_END
>       RET
>  .endm
>  #endif
> 
>  .macro VMX_DO_EVENT_IRQOFF call_insn call_target
> -     /*
> -      * Unconditionally create a stack frame, getting the correct RSP on the
> -      * stack (for x86-64) would take two instructions anyways, and RBP can
> -      * be used to restore RSP to make objtool happy (see below).
> -      */
> -     push %_ASM_BP
> -     mov %_ASM_SP, %_ASM_BP
> +     VMX_DO_EVENT_FRAME_BEGIN
> 
>  #ifdef CONFIG_X86_64
>       /*
> @@ -129,14 +124,7 @@
>       push $__KERNEL_CS
>       \call_insn \call_target
> 
> -     /*
> -      * "Restore" RSP from RBP, even though IRET has already unwound RSP to
> -      * the correct value.  objtool doesn't know the callee will IRET and,
> -      * without the explicit restore, thinks the stack is getting walloped.
> -      * Using an unwind hint is problematic due to x86-64's dynamic 
> alignment.
> -      */
> -     mov %_ASM_BP, %_ASM_SP
> -     pop %_ASM_BP
> +     VMX_DO_EVENT_FRAME_END
>       RET
>  .endm
> 
> @@ -375,11 +363,13 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit,
> SYM_L_GLOBAL)
> 
>  SYM_FUNC_END(__vmx_vcpu_run)
> 
> -#ifdef CONFIG_X86_FRED
>  SYM_FUNC_START(vmx_do_fred_nmi_irqoff)
> +#ifdef CONFIG_X86_FRED
>       VMX_DO_FRED_EVENT_IRQOFF jmp fred_entrypoint_kernel nmi=1
> +#else
> +     ud2
> +#endif
>  SYM_FUNC_END(vmx_do_fred_nmi_irqoff)
> -#endif
> 
>  SYM_FUNC_START(vmx_do_nmi_irqoff)
>       VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
> @@ -438,11 +428,13 @@ SYM_FUNC_END(vmread_error_trampoline)
>  #endif
> 
>  .section .text, "ax"
> -#ifdef CONFIG_X86_FRED
>  SYM_FUNC_START(vmx_do_fred_interrupt_irqoff)
> -     VMX_DO_FRED_EVENT_IRQOFF call external_interrupt
> +#ifdef CONFIG_X86_FRED
> +     VMX_DO_FRED_EVENT_IRQOFF call fred_irq_entry_from_kvm
> +#else
> +     ud2
> +#endif
>  SYM_FUNC_END(vmx_do_fred_interrupt_irqoff)
> -#endif
> 
>  SYM_FUNC_START(vmx_do_interrupt_irqoff)
>       VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index bf757f5071e4..cb4675dd87df 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6919,13 +6919,8 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
>       memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
>  }
> 
> -#ifdef CONFIG_X86_FRED
>  void vmx_do_fred_interrupt_irqoff(unsigned int vector);
>  void vmx_do_fred_nmi_irqoff(unsigned int vector);
> -#else
> -#define vmx_do_fred_interrupt_irqoff(x) BUG()
> -#define vmx_do_fred_nmi_irqoff(x) BUG()
> -#endif
> 
>  void vmx_do_interrupt_irqoff(unsigned long entry);
>  void vmx_do_nmi_irqoff(void);
> @@ -6976,7 +6971,7 @@ static void handle_external_interrupt_irqoff(struct
> kvm_vcpu *vcpu)
> 
>       kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
>       if (cpu_feature_enabled(X86_FEATURE_FRED))
> -             vmx_do_fred_interrupt_irqoff(vector);   /* Event type is 0 */
> +             vmx_do_fred_interrupt_irqoff((EVENT_TYPE_HWINT << 16) |
> vector);
>       else
>               vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base
> + vector));
>       kvm_after_interrupt(vcpu);
> 
> base-commit: 8961078ffe509a97ec7803b17912e57c47b93fa2
> --




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.