[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [V10 PATCH 23/23] PVH xen: introduce vmexit handler for PVH



On Wed, Jul 24, 2013 at 2:59 AM, Mukesh Rathor <mukesh.rathor@xxxxxxxxxx> wrote:
> This patch contains vmx exit handler for PVH guest. Note it contains
> a macro dbgp1 to print vmexit reasons and a lot of other data
> to go with it. It can be enabled by setting pvhdbg to 1. This can be
> very useful debugging for the first few months of testing, after which
> it can be removed at the maintainer's discretion.
>
> Changes in V2:
>   - Move non VMX generic code to arch/x86/hvm/pvh.c
>   - Remove get_gpr_ptr() and use existing decode_register() instead.
>   - Defer call to pvh vmx exit handler until interrupts are enabled. So the
>     caller vmx_pvh_vmexit_handler() handles the NMI/EXT-INT/TRIPLE_FAULT now.
>   - Fix the CPUID (wrongly) clearing bit 24. No need to do this now, set
>     the correct feature bits in CR4 during vmcs creation.
>   - Fix few hard tabs.
>
> Changes in V3:
>   - Lot of cleanup and rework in PVH vm exit handler.
>   - add parameter to emulate_forced_invalid_op().
>
> Changes in V5:
>   - Move pvh.c and emulate_forced_invalid_op related changes to another patch.
>   - Formatting.
>   - Remove vmx_pvh_read_descriptor().
>   - Use SS DPL instead of CS.RPL for CPL.
>   - Remove pvh_user_cpuid() and call pv_cpuid for user mode also.
>
> Changes in V6:
>   - Replace domain_crash_synchronous() with domain_crash().
>
> Changes in V7:
>   - Don't read all selectors on every vmexit. Do that only for the
>     IO instruction vmexit.
>   - Add couple checks and set guest_cr[4] in access_cr4().
>   - Add period after all comments in case that's an issue.
>   - Move making pv_cpuid and emulate_privileged_op public here.
>
> Changes in V8:
>   - Mainly, don't read selectors on vmexit. The macros now come to VMCS
>     to read selectors on demand.

Overall I have the same comment here as I had for the VMCS patch: the
code looks 98% identical.  Substantial differences seem to be:
 - emulation of privileged ops
 - cpuid
 - cr4 handling

It seems like it would be much better to share the codepath and just
put "is_pvh_domain()" in the places where it needs to be different.

More comments / questions inline.

>
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/vmx/pvh.c |  451 
> +++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 450 insertions(+), 1 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmx/pvh.c b/xen/arch/x86/hvm/vmx/pvh.c
> index 8e61d23..ba11967 100644
> --- a/xen/arch/x86/hvm/vmx/pvh.c
> +++ b/xen/arch/x86/hvm/vmx/pvh.c
> @@ -20,9 +20,458 @@
>  #include <asm/hvm/nestedhvm.h>
>  #include <asm/xstate.h>
>
> -/* Implemented in the next patch */
> +#ifndef NDEBUG
> +static int pvhdbg = 0;
> +#define dbgp1(...) do { (pvhdbg == 1) ? printk(__VA_ARGS__) : 0; } while ( 0 
> )
> +#else
> +#define dbgp1(...) ((void)0)
> +#endif
> +
> +/* Returns : 0 == msr read successfully. */
> +static int vmxit_msr_read(struct cpu_user_regs *regs)
> +{
> +    u64 msr_content = 0;
> +
> +    switch ( regs->ecx )
> +    {
> +    case MSR_IA32_MISC_ENABLE:
> +        rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
> +        msr_content |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
> +                       MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL;
> +        break;
> +
> +    default:
> +        /* PVH fixme: see hvm_msr_read_intercept(). */
> +        rdmsrl(regs->ecx, msr_content);
> +        break;

So at the moment you basically pass through all MSR reads (adding
BTS_UNAVAIL and PEBS_UNAVAIL to MISC_ENABLE), but send MSR writes
through the hvm code?

That sounds like it's asking for trouble...

> +    }
> +    regs->eax = (uint32_t)msr_content;
> +    regs->edx = (uint32_t)(msr_content >> 32);
> +    vmx_update_guest_eip();
> +
> +    dbgp1("msr read c:%lx a:%lx d:%lx RIP:%lx RSP:%lx\n", regs->ecx, 
> regs->eax,
> +          regs->edx, regs->rip, regs->rsp);
> +
> +    return 0;
> +}
> +
> +/* Returns : 0 == msr written successfully. */
> +static int vmxit_msr_write(struct cpu_user_regs *regs)
> +{
> +    uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
> +
> +    dbgp1("PVH: msr write:0x%lx. eax:0x%lx edx:0x%lx\n", regs->ecx,
> +          regs->eax, regs->edx);
> +
> +    if ( hvm_msr_write_intercept(regs->ecx, msr_content) == X86EMUL_OKAY )
> +    {
> +        vmx_update_guest_eip();
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +static int vmxit_debug(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *vp = current;
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +
> +    write_debugreg(6, exit_qualification | 0xffff0ff0);
> +
> +    /* gdbsx or another debugger. Never pause dom0. */
> +    if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
> +        domain_pause_for_debugger();
> +    else
> +        hvm_inject_hw_exception(TRAP_debug, HVM_DELIVER_NO_ERROR_CODE);

Hmm, strangely enough, the HVM handler for this doesn't seem to
deliver this exception -- or if it does, I can't quite figure out
where.  What you have here seems like the correct thing to do, but I
would be interested in knowing the reason for the HVM behavior.

> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: handled the MTF vmexit. */
> +static int vmxit_mtf(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *vp = current;
> +    int rc = -EINVAL, ss = vp->arch.hvm_vcpu.single_step;
> +
> +    vp->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vp->arch.hvm_vmx.exec_control);
> +    vp->arch.hvm_vcpu.single_step = 0;
> +
> +    if ( vp->domain->debugger_attached && ss )
> +    {
> +        domain_pause_for_debugger();
> +        rc = 0;
> +    }
> +    return rc;
> +}
> +
> +static int vmxit_int3(struct cpu_user_regs *regs)
> +{
> +    int ilen = vmx_get_instruction_length();
> +    struct vcpu *vp = current;
> +    struct hvm_trap trap_info = {
> +        .vector = TRAP_int3,
> +        .type = X86_EVENTTYPE_SW_EXCEPTION,
> +        .error_code = HVM_DELIVER_NO_ERROR_CODE,
> +        .insn_len = ilen
> +    };
> +
> +    /* gdbsx or another debugger. Never pause dom0. */
> +    if ( vp->domain->domain_id != 0 && vp->domain->debugger_attached )
> +    {
> +        regs->eip += ilen;
> +        dbgp1("[%d]PVH: domain pause for debugger\n", smp_processor_id());
> +        current->arch.gdbsx_vcpu_event = TRAP_int3;
> +        domain_pause_for_debugger();
> +        return 0;
> +    }
> +    hvm_inject_trap(&trap_info);
> +
> +    return 0;
> +}
> +
> +/* Just like HVM, PVH should be using "cpuid" from the kernel mode. */
> +static int vmxit_invalid_op(struct cpu_user_regs *regs)
> +{
> +    if ( guest_kernel_mode(current, regs) || 
> !emulate_forced_invalid_op(regs) )
> +        hvm_inject_hw_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE);
> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: handled the exception. */
> +static int vmxit_exception(struct cpu_user_regs *regs)
> +{
> +    int vector = (__vmread(VM_EXIT_INTR_INFO)) & INTR_INFO_VECTOR_MASK;
> +    int rc = -ENOSYS;

The vmx code here has some handler for faults that happen during a
guest IRET -- is that an issue for PVH?

> +
> +    dbgp1(" EXCPT: vec:%d cs:%lx r.IP:%lx\n", vector,
> +          __vmread(GUEST_CS_SELECTOR), regs->eip);
> +
> +    switch ( vector )
> +    {
> +    case TRAP_debug:
> +        rc = vmxit_debug(regs);
> +        break;
> +
> +    case TRAP_int3:
> +        rc = vmxit_int3(regs);
> +        break;
> +
> +    case TRAP_invalid_op:
> +        rc = vmxit_invalid_op(regs);
> +        break;
> +
> +    case TRAP_no_device:
> +        hvm_funcs.fpu_dirty_intercept();
> +        rc = 0;
> +        break;
> +
> +    default:
> +        printk(XENLOG_G_WARNING
> +               "PVH: Unhandled trap:%d. IP:%lx\n", vector, regs->eip);
> +    }
> +    return rc;
> +}
> +
> +static int vmxit_vmcall(struct cpu_user_regs *regs)
> +{
> +    if ( hvm_do_hypercall(regs) != HVM_HCALL_preempted )
> +        vmx_update_guest_eip();
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success. */
> +static int access_cr0(struct cpu_user_regs *regs, uint acc_typ, uint64_t 
> *regp)
> +{
> +    struct vcpu *vp = current;
> +
> +    if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        unsigned long new_cr0 = *regp;
> +        unsigned long old_cr0 = __vmread(GUEST_CR0);
> +
> +        dbgp1("PVH:writing to CR0. RIP:%lx val:0x%lx\n", regs->rip, *regp);
> +        if ( (u32)new_cr0 != new_cr0 )
> +        {
> +            printk(XENLOG_G_WARNING
> +                   "Guest setting upper 32 bits in CR0: %lx", new_cr0);
> +            return -EPERM;
> +        }
> +
> +        new_cr0 &= ~HVM_CR0_GUEST_RESERVED_BITS;
> +        /* ET is reserved and should always be 1. */
> +        new_cr0 |= X86_CR0_ET;
> +
> +        /* A pvh is not expected to change to real mode. */
> +        if ( (new_cr0 & (X86_CR0_PE | X86_CR0_PG)) !=
> +             (X86_CR0_PG | X86_CR0_PE) )
> +        {
> +            printk(XENLOG_G_WARNING
> +                   "PVH attempting to turn off PE/PG. CR0:%lx\n", new_cr0);
> +            return -EPERM;
> +        }
> +        /* TS going from 1 to 0 */
> +        if ( (old_cr0 & X86_CR0_TS) && ((new_cr0 & X86_CR0_TS) == 0) )
> +            vmx_fpu_enter(vp);
> +
> +        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = new_cr0;
> +        __vmwrite(GUEST_CR0, new_cr0);
> +        __vmwrite(CR0_READ_SHADOW, new_cr0);
> +    }
> +    else
> +        *regp = __vmread(GUEST_CR0);

The HVM code here just uses hvm_vcpu.guest_cr[] -- is there any reason
not to do the same here?  And in any case, shouldn't it be
CR0_READ_SHADOW?

> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success. */
> +static int access_cr4(struct cpu_user_regs *regs, uint acc_typ, uint64_t 
> *regp)
> +{
> +    if ( acc_typ == VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR )
> +    {
> +        struct vcpu *vp = current;
> +        u64 old_val = __vmread(GUEST_CR4);
> +        u64 new = *regp;
> +
> +        if ( new & HVM_CR4_GUEST_RESERVED_BITS(vp) )
> +        {
> +            printk(XENLOG_G_WARNING
> +                   "PVH guest attempts to set reserved bit in CR4: %lx", 
> new);
> +            hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +            return 0;
> +        }
> +
> +        if ( !(new & X86_CR4_PAE) && hvm_long_mode_enabled(vp) )
> +        {
> +            printk(XENLOG_G_WARNING "Guest cleared CR4.PAE while "
> +                   "EFER.LMA is set");
> +            hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +            return 0;
> +        }
> +
> +        vp->arch.hvm_vcpu.guest_cr[4] = new;
> +
> +        if ( (old_val ^ new) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
> +            vpid_sync_all();

Is it actually allowed for a PVH guest to change modes like this?

I realize that at the moment you're only supporting HAP, but that may
not always be true; would it make sense to call
paging_update_paging_modes() here instead?

> +
> +        __vmwrite(CR4_READ_SHADOW, new);
> +
> +        new &= ~X86_CR4_PAE;         /* PVH always runs with hap enabled. */
> +        new |= X86_CR4_VMXE | X86_CR4_MCE;
> +        __vmwrite(GUEST_CR4, new);

Should you be updating hvm_vcpu.hw_cr[4] to this value?

> +    }
> +    else
> +        *regp = __vmread(CR4_READ_SHADOW);

Same as above re guest_cr[]

> +
> +    return 0;
> +}
> +
> +/* Returns: rc == 0: success, else -errno. */
> +static int vmxit_cr_access(struct cpu_user_regs *regs)
> +{
> +    unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +    uint acc_typ = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
> +    int cr, rc = -EINVAL;
> +
> +    switch ( acc_typ )
> +    {
> +    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
> +    case VMX_CONTROL_REG_ACCESS_TYPE_MOV_FROM_CR:
> +    {
> +        uint gpr = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
> +        uint64_t *regp = decode_register(gpr, regs, 0);
> +        cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
> +
> +        if ( regp == NULL )
> +            break;
> +
> +        switch ( cr )
> +        {
> +        case 0:
> +            rc = access_cr0(regs, acc_typ, regp);
> +            break;
> +
> +        case 3:
> +            printk(XENLOG_G_ERR "PVH: unexpected cr3 vmexit. rip:%lx\n",
> +                   regs->rip);
> +            domain_crash(current->domain);
> +            break;
> +
> +        case 4:
> +            rc = access_cr4(regs, acc_typ, regp);
> +            break;
> +        }
> +        if ( rc == 0 )
> +            vmx_update_guest_eip();
> +        break;
> +    }
> +
> +    case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
> +    {
> +        struct vcpu *vp = current;
> +        unsigned long cr0 = vp->arch.hvm_vcpu.guest_cr[0] & ~X86_CR0_TS;
> +        vp->arch.hvm_vcpu.hw_cr[0] = vp->arch.hvm_vcpu.guest_cr[0] = cr0;
> +
> +        vmx_fpu_enter(vp);
> +        __vmwrite(GUEST_CR0, cr0);
> +        __vmwrite(CR0_READ_SHADOW, cr0);
> +        vmx_update_guest_eip();
> +        rc = 0;
> +    }
> +    }
> +    return rc;
> +}
> +
> +/*
> + * Note: A PVH guest sets IOPL natively by setting bits in the eflags, and 
> not
> + *       via hypercalls used by a PV.
> + */
> +static int vmxit_io_instr(struct cpu_user_regs *regs)
> +{
> +    struct segment_register seg;
> +    int requested = (regs->rflags & X86_EFLAGS_IOPL) >> 12;
> +    int curr_lvl = (regs->rflags & X86_EFLAGS_VM) ? 3 : 0;
> +
> +    if ( curr_lvl == 0 )
> +    {
> +        hvm_get_segment_register(current, x86_seg_ss, &seg);
> +        curr_lvl = seg.attr.fields.dpl;
> +    }
> +    if ( requested >= curr_lvl && emulate_privileged_op(regs) )
> +        return 0;
> +
> +    hvm_inject_hw_exception(TRAP_gp_fault, regs->error_code);
> +    return 0;
> +}
> +
> +static int pvh_ept_handle_violation(unsigned long qualification,
> +                                    paddr_t gpa, struct cpu_user_regs *regs)
> +{
> +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
> +    p2m_type_t p2mt;
> +    mfn_t mfn = get_gfn_query_unlocked(current->domain, gfn, &p2mt);
> +
> +    printk(XENLOG_G_ERR "EPT violation %#lx (%c%c%c/%c%c%c), "
> +           "gpa %#"PRIpaddr", mfn %#lx, type %i. IP:0x%lx RSP:0x%lx\n",
> +           qualification,
> +           (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
> +           (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
> +           (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
> +           (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
> +           (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
> +           (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
> +           gpa, mfn_x(mfn), p2mt, regs->rip, regs->rsp);
> +
> +    ept_walk_table(current->domain, gfn);
> +
> +    if ( qualification & EPT_GLA_VALID )
> +    {
> +        gla = __vmread(GUEST_LINEAR_ADDRESS);
> +        printk(XENLOG_G_ERR " --- GLA %#lx\n", gla);
> +    }
> +    hvm_inject_hw_exception(TRAP_gp_fault, 0);
> +    return 0;
> +}
> +
> +/*
> + * Main vm exit handler for PVH . Called from vmx_vmexit_handler().
> + * Note: vmx_asm_vmexit_handler updates rip/rsp/eflags in regs{} struct.
> + */
>  void vmx_pvh_vmexit_handler(struct cpu_user_regs *regs)
>  {
> +    unsigned long exit_qualification;
> +    unsigned int exit_reason = __vmread(VM_EXIT_REASON);
> +    int rc=0, ccpu = smp_processor_id();
> +    struct vcpu *v = current;
> +
> +    dbgp1("PVH:[%d]left VMCS exitreas:%d RIP:%lx RSP:%lx EFLAGS:%lx 
> CR0:%lx\n",
> +          ccpu, exit_reason, regs->rip, regs->rsp, regs->rflags,
> +          __vmread(GUEST_CR0));
> +
> +    switch ( (uint16_t)exit_reason )
> +    {
> +    /* NMI and machine_check are handled by the caller, we handle rest here 
> */
> +    case EXIT_REASON_EXCEPTION_NMI:      /* 0 */
> +        rc = vmxit_exception(regs);
> +        break;
> +
> +    case EXIT_REASON_EXTERNAL_INTERRUPT: /* 1 */
> +        break;              /* handled in vmx_vmexit_handler() */

This seems to be a weird way to do things, but I see this is what they
do in vmx_vmexit_handler() as well, so I guess it makes sense to
follow suit.

What about EXIT_REASON_TRIPLE_FAULT?

[End of in-line comments]

> +
> +    case EXIT_REASON_PENDING_VIRT_INTR:  /* 7 */
> +        /* Disable the interrupt window. */
> +        v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
> +        break;
> +
> +    case EXIT_REASON_CPUID:              /* 10 */
> +        pv_cpuid(regs);
> +        vmx_update_guest_eip();
> +        break;
> +
> +    case EXIT_REASON_HLT:                /* 12 */
> +        vmx_update_guest_eip();
> +        hvm_hlt(regs->eflags);
> +        break;
> +
> +    case EXIT_REASON_VMCALL:             /* 18 */
> +        rc = vmxit_vmcall(regs);
> +        break;
> +
> +    case EXIT_REASON_CR_ACCESS:          /* 28 */
> +        rc = vmxit_cr_access(regs);
> +        break;
> +
> +    case EXIT_REASON_DR_ACCESS:          /* 29 */
> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        vmx_dr_access(exit_qualification, regs);
> +        break;
> +
> +    case EXIT_REASON_IO_INSTRUCTION:     /* 30 */
> +        vmxit_io_instr(regs);
> +        break;
> +
> +    case EXIT_REASON_MSR_READ:           /* 31 */
> +        rc = vmxit_msr_read(regs);
> +        break;
> +
> +    case EXIT_REASON_MSR_WRITE:          /* 32 */
> +        rc = vmxit_msr_write(regs);
> +        break;
> +
> +    case EXIT_REASON_MONITOR_TRAP_FLAG:  /* 37 */
> +        rc = vmxit_mtf(regs);
> +        break;
> +
> +    case EXIT_REASON_MCE_DURING_VMENTRY: /* 41 */
> +        break;              /* handled in vmx_vmexit_handler() */
> +
> +    case EXIT_REASON_EPT_VIOLATION:      /* 48 */
> +    {
> +        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        rc = pvh_ept_handle_violation(exit_qualification, gpa, regs);
> +        break;
> +    }
> +
> +    default:
> +        rc = 1;
> +        printk(XENLOG_G_ERR
> +               "PVH: Unexpected exit reason:%#x\n", exit_reason);
> +    }
> +
> +    if ( rc )
> +    {
> +        exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        printk(XENLOG_G_WARNING
> +               "PVH: [%d] exit_reas:%d %#x qual:%ld 0x%lx cr0:0x%016lx\n",
> +               ccpu, exit_reason, exit_reason, exit_qualification,
> +               exit_qualification, __vmread(GUEST_CR0));
> +        printk(XENLOG_G_WARNING "PVH: RIP:%lx RSP:%lx EFLAGS:%lx CR3:%lx\n",
> +               regs->rip, regs->rsp, regs->rflags, __vmread(GUEST_CR3));
> +        domain_crash(v->domain);
> +    }
>  }
>
>  /*
> --
> 1.7.2.3
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, who monitor our
servers 24x7x365, and is backed by RackSpace's Fanatical Support®.