x86/PV: support data breakpoint extension registers

Introduce an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext, similar to
the generic MSR save/restore logic recently added for HVM, so that the
AMD data breakpoint extension MSRs (the DRn address masks) of a PV
guest survive save/restore and migration. PV guests gain access to
these MSRs through the privileged-operation emulation of RDMSR/WRMSR.

This also moves some debug register related declarations/definitions to
the header intended for them.

Signed-off-by: Jan Beulich
---
v2: libxc adjustment put in place (depending on the separately posted
    http://lists.xenproject.org/archives/html/xen-devel/2014-03/msg03059.html)
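Note: with the MSR records included, each vCPU's slot in the PV tail
buffer becomes variable length, and the record size can only be
determined while parsing, which is why buffer_tail_pv() below reads the
fixed-size parts first and then grows its buffer once each vCPU's
msr_count is known. As a rough sketch (illustrative names only, not
part of the patch), a slot is sized as:

    /* Sketch: layout of one per-vcpu slot in the PV tail buffer after
     * this change. Not part of the patch; names are illustrative. */
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {            /* mirrors xen_domctl_ext_vcpu_msr_t */
        uint32_t index;
        uint32_t reserved;
        uint64_t value;         /* uint64_aligned_t on the wire */
    } msr_record_t;

    /* 128 bytes of xen_domctl_ext_vcpucontext, then the MSR records
     * announced by its msr_count field, then the eXtended State area. */
    static size_t pv_vcpu_slot_size(uint16_t msr_count, size_t xstate_size)
    {
        return 128 + msr_count * sizeof(msr_record_t) + xstate_size;
    }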
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -590,8 +590,13 @@ static int buffer_tail_pv(xc_interface *
                           uint32_t vcpuextstate_size)
 {
     unsigned int i;
-    size_t pfnlen, vcpulen;
+    size_t pfnlen, vcpulen, total;
+    int alloc = 0;
     struct domain_info_context *dinfo = &ctx->dinfo;
+    union {
+        const unsigned char *raw;
+        const xen_domctl_ext_vcpucontext_t *evc;
+    } ptr;
 
     /* TODO: handle changing pfntab and vcpu counts */
     /* PFN tab */
@@ -634,11 +639,36 @@ static int buffer_tail_pv(xc_interface *
             ERROR("Error allocating VCPU ctxt tail buffer");
             goto free_pfntab;
         }
+        alloc = 1;
     }
     // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
-    if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
-        PERROR("Error when reading ctxt");
-        goto free_vcpus;
+    for (total = i = 0, ptr.raw = buf->vcpubuf; ext_vcpucontext; ) {
+        if ( RDEXACT(fd, buf->vcpubuf + total, vcpulen) ) {
+            PERROR("Error when reading ctxt");
+            goto free_vcpus;
+        }
+        total += vcpulen;
+        for (vcpulen = 0; i < buf->vcpucount; ++i) {
+            size_t msrlen;
+
+            if ((const unsigned char *)(ptr.evc + 1) > buf->vcpubuf + total)
+                break;
+            msrlen = ptr.evc->msr_count * sizeof(xen_domctl_ext_vcpu_msr_t);
+            vcpulen += msrlen;
+            ptr.raw += 128 + msrlen + vcpuextstate_size;
+        }
+        if (!vcpulen)
+            break;
+        if (alloc) {
+            void *nbuf = realloc(buf->vcpubuf, total + vcpulen);
+
+            if (!nbuf) {
+                ERROR("Error growing VCPU ctxt tail buffer");
+                goto free_vcpus;
+            }
+            ptr.raw = nbuf + (ptr.raw - buf->vcpubuf);
+            buf->vcpubuf = nbuf;
+        }
     }
 
     /* load shared_info_page */
@@ -1996,6 +2026,8 @@ int xc_domain_restore(xc_interface *xch,
     vcpup = tailbuf.u.pv.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
+        DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
+
         if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
             continue;
 
@@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
             goto vcpu_ext_state_restore;
         memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
         vcpup += 128;
+        if ( domctl.u.ext_vcpucontext.msr_count )
+        {
+            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
+
+            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
+            if ( !msrs )
+            {
+                PERROR("No memory for vcpu%d MSRs", i);
+                goto out;
+            }
+            memcpy(msrs, vcpup, sz);
+            vcpup += sz;
+            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
+        }
         domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
         domctl.domain = dom;
         frc = xc_domctl(xch, &domctl);
+        if ( msrs )
+            xc_hypercall_buffer_free(xch, msrs);
         if ( frc != 0 )
         {
             PERROR("Couldn't set extended vcpu%d info", i);
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -836,6 +836,9 @@ int xc_domain_save(xc_interface *xch, in
     /* base of the region in which domain memory is mapped */
     unsigned char *region_base = NULL;
 
+    /* MSR extensions to xen_domctl_ext_vcpucontext_t */
+    DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
+
    /* A copy of the CPU eXtended States of the guest. */
    DECLARE_HYPERCALL_BUFFER(void, buffer);
 
@@ -1960,16 +1963,36 @@ int xc_domain_save(xc_interface *xch, in
             domctl.domain = dom;
             memset(&domctl.u, 0, sizeof(domctl.u));
             domctl.u.ext_vcpucontext.vcpu = i;
-            if ( xc_domctl(xch, &domctl) < 0 )
+            frc = xc_domctl(xch, &domctl);
+            if ( frc < 0 && errno == ENOBUFS && domctl.u.ext_vcpucontext.msr_count )
+            {
+                msrs = xc_hypercall_buffer_alloc(xch, msrs,
+                                                 domctl.u.ext_vcpucontext.msr_count *
+                                                 sizeof(*msrs));
+                set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
+                frc = msrs ? xc_domctl(xch, &domctl) : -1;
+            }
+            if ( frc < 0 )
             {
                 PERROR("No extended context for VCPU%d", i);
                 goto out;
             }
             if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) )
             {
-                PERROR("Error when writing to state file (2)");
+                PERROR("Error when writing to state file (ext ctxt)");
                 goto out;
             }
+            if ( msrs )
+            {
+                if ( wrexact(io_fd, msrs,
+                             domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs)) )
+                {
+                    PERROR("Error when writing to state file (MSRs)");
+                    goto out;
+                }
+                xc_hypercall_buffer_free(xch, msrs);
+                msrs = NULL;
+            }
 
             /* Start to fetch CPU eXtended States */
             /* Get buffer size first */
@@ -2134,6 +2157,8 @@ int xc_domain_save(xc_interface *xch, in
 
     xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
     xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
+    if (msrs)
+        xc_hypercall_buffer_free(xch, msrs);
 
     free(pfn_type);
     free(pfn_batch);
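For clarity, the save side above relies on a probe-then-retry protocol:
the first XEN_DOMCTL_get_ext_vcpucontext is issued without an MSR
buffer; if the vCPU has address-mask state, the hypervisor fails that
call with -ENOBUFS but reports the needed record count in msr_count,
and the call is retried with a suitably sized buffer. A condensed
sketch, assuming libxc's usual DECLARE_DOMCTL and hypercall-buffer
helpers (the function itself is hypothetical, error handling trimmed):

    static int probe_and_fetch_msrs(xc_interface *xch, uint32_t dom,
                                    uint32_t vcpu)
    {
        DECLARE_DOMCTL;
        DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
        int rc;

        domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
        domctl.domain = dom;
        memset(&domctl.u, 0, sizeof(domctl.u));
        domctl.u.ext_vcpucontext.vcpu = vcpu;

        /* Probe: no buffer attached, msr_count is zero. */
        rc = xc_domctl(xch, &domctl);
        if ( rc < 0 && errno == ENOBUFS && domctl.u.ext_vcpucontext.msr_count )
        {
            msrs = xc_hypercall_buffer_alloc(xch, msrs,
                                             domctl.u.ext_vcpucontext.msr_count *
                                             sizeof(*msrs));
            if ( !msrs )
                return -1;
            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
            /* Retry with room for the reported number of records. */
            rc = xc_domctl(xch, &domctl);
            /* ... consume msrs[0 .. msr_count-1] ... */
            xc_hypercall_buffer_free(xch, msrs);
        }
        return rc;
    }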
--- a/xen/arch/x86/acpi/suspend.c
+++ b/xen/arch/x86/acpi/suspend.c
@@ -9,6 +9,7 @@
 #include <xen/smp.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
+#include <asm/debugreg.h>
 #include <asm/flushtlb.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1316,14 +1316,7 @@ static void paravirt_ctxt_switch_to(stru
     write_cr4(cr4);
 
     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
-    {
-        write_debugreg(0, v->arch.debugreg[0]);
-        write_debugreg(1, v->arch.debugreg[1]);
-        write_debugreg(2, v->arch.debugreg[2]);
-        write_debugreg(3, v->arch.debugreg[3]);
-        write_debugreg(6, v->arch.debugreg[6]);
-        write_debugreg(7, v->arch.debugreg[7]);
-    }
+        activate_debugregs(v);
 
     if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
          boot_cpu_has(X86_FEATURE_RDTSCP) )
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -52,6 +52,7 @@ long arch_do_domctl(
 {
     long ret = 0;
     bool_t copyback = 0;
+    unsigned long i;
 
     switch ( domctl->cmd )
     {
@@ -319,7 +320,6 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_getmemlist:
     {
-        int i;
         unsigned long max_pfns = domctl->u.getmemlist.max_pfns;
         uint64_t mfn;
         struct page_info *page;
@@ -645,7 +645,6 @@ long arch_do_domctl(
         unsigned long mfn = domctl->u.memory_mapping.first_mfn;
         unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
         int add = domctl->u.memory_mapping.add_mapping;
-        unsigned long i;
 
         ret = -EINVAL;
         if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
@@ -809,6 +808,7 @@ long arch_do_domctl(
     {
         struct xen_domctl_ext_vcpucontext *evc;
         struct vcpu *v;
+        struct xen_domctl_ext_vcpu_msr msr;
 
         evc = &domctl->u.ext_vcpucontext;
 
@@ -854,7 +854,42 @@ long arch_do_domctl(
             evc->vmce.mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2;
             evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2;
 
-            ret = 0;
+            i = ret = 0;
+            if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+            {
+                unsigned int j;
+
+                if ( v->arch.pv_vcpu.dr_mask[0] )
+                {
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR0_ADDRESS_MASK;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[0];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+                for ( j = 0; j < 3; ++j )
+                {
+                    if ( !v->arch.pv_vcpu.dr_mask[1 + j] )
+                        continue;
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR1_ADDRESS_MASK + j;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[1 + j];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+            }
+            if ( i > evc->msr_count && !ret )
+                ret = -ENOBUFS;
+            evc->msr_count = i;
+
             vcpu_unpause(v);
             copyback = 1;
         }
@@ -909,9 +944,49 @@ long arch_do_domctl(
 
                 ret = vmce_restore_vcpu(v, &vmce);
             }
+            else if ( evc->size > offsetof(typeof(*evc), vmce) )
+                ret = -EINVAL;
             else
                 ret = 0;
 
+            if ( ret || evc->size <= offsetof(typeof(*evc), msrs) )
+                /* nothing */;
+            else if ( evc->size < offsetof(typeof(*evc), msrs) +
+                                  sizeof(evc->msrs) )
+                ret = -EINVAL;
+            else
+            {
+                for ( i = 0; i < evc->msr_count; ++i )
+                {
+                    ret = -EFAULT;
+                    if ( copy_from_guest_offset(&msr, evc->msrs, i, 1) )
+                        break;
+                    ret = -EINVAL;
+                    if ( msr.reserved )
+                        break;
+                    switch ( msr.index )
+                    {
+                    case MSR_AMD64_DR0_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        v->arch.pv_vcpu.dr_mask[0] = msr.value;
+                        continue;
+                    case MSR_AMD64_DR1_ADDRESS_MASK ...
+                         MSR_AMD64_DR3_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1;
+                        v->arch.pv_vcpu.dr_mask[msr.index] = msr.value;
+                        continue;
+                    }
+                    break;
+                }
+                if ( i == evc->msr_count )
+                    ret = 0;
+            }
+
             domain_unpause(d);
         }
     }
@@ -921,7 +996,6 @@ long arch_do_domctl(
     {
         xen_domctl_cpuid_t *ctl = &domctl->u.cpuid;
        cpuid_input_t *cpuid = NULL;
-        int i;
 
        for ( i = 0; i < MAX_CPUID_INPUT; i++ )
        {
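Both the domctl handlers above and the MSR emulation below need the
same index arithmetic, because AMD numbers the mask MSRs
non-contiguously: DR0's mask lives at a separate MSR (0xc0011027) from
the DR1-DR3 masks (0xc0011019-0xc001101b), hence the two case labels
and the `msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1` adjustment. As a
sketch (MSR values per AMD's documentation; the helper itself is
illustrative, not from the patch):

    /* Illustrative helper only: map an address-mask MSR index to its
     * dr_mask[] slot (0-3); -1 for anything that is not one of the
     * four masks. */
    #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027
    #define MSR_AMD64_DR1_ADDRESS_MASK 0xc0011019
    #define MSR_AMD64_DR3_ADDRESS_MASK 0xc001101b

    static int dr_mask_slot(uint32_t index)
    {
        if ( index == MSR_AMD64_DR0_ADDRESS_MASK )
            return 0;
        if ( index >= MSR_AMD64_DR1_ADDRESS_MASK &&
             index <= MSR_AMD64_DR3_ADDRESS_MASK )
            return index - MSR_AMD64_DR1_ADDRESS_MASK + 1;
        return -1;
    }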
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2498,6 +2498,23 @@ static int emulate_privileged_op(struct
             if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
                 goto fail;
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask[0] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask
+                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(regs->_ecx, msr_content);
+            break;
+
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
                 break;
@@ -2585,6 +2602,21 @@ static int emulate_privileged_op(struct
             regs->eax = (uint32_t)msr_content;
             regs->edx = (uint32_t)(msr_content >> 32);
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask[0];
+            regs->edx = 0;
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ...
+             MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask
+                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+            regs->edx = 0;
+            break;
+
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
             {
@@ -3628,7 +3660,27 @@ long do_set_trap_table(XEN_GUEST_HANDLE_
     return rc;
 }
 
-long set_debugreg(struct vcpu *v, int reg, unsigned long value)
+void activate_debugregs(const struct vcpu *curr)
+{
+    ASSERT(curr == current);
+
+    write_debugreg(0, curr->arch.debugreg[0]);
+    write_debugreg(1, curr->arch.debugreg[1]);
+    write_debugreg(2, curr->arch.debugreg[2]);
+    write_debugreg(3, curr->arch.debugreg[3]);
+    write_debugreg(6, curr->arch.debugreg[6]);
+    write_debugreg(7, curr->arch.debugreg[7]);
+
+    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+    {
+        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
+        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
+        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
+        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
+    }
+}
+
+long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
 {
     int i;
     struct vcpu *curr = current;
@@ -3709,11 +3761,8 @@ long set_debugreg(struct vcpu *v, int re
             if ( (v == curr) &&
                  !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
             {
-                write_debugreg(0, v->arch.debugreg[0]);
-                write_debugreg(1, v->arch.debugreg[1]);
-                write_debugreg(2, v->arch.debugreg[2]);
-                write_debugreg(3, v->arch.debugreg[3]);
-                write_debugreg(6, v->arch.debugreg[6]);
+                activate_debugregs(curr);
+                break;
             }
         }
         if ( v == curr )
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -64,4 +64,16 @@
 #define DR_GLOBAL_EXACT_ENABLE (0x00000200ul) /* Global exact enable */
 #define DR_GENERAL_DETECT      (0x00002000ul) /* General detect enable */
 
+#define write_debugreg(reg, val) do {                       \
+    unsigned long __val = val;                              \
+    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
+} while (0)
+#define read_debugreg(reg) ({                               \
+    unsigned long __val;                                    \
+    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
+    __val;                                                  \
+})
+long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
+void activate_debugregs(const struct vcpu *);
+
 #endif /* _X86_DEBUGREG_H */
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -374,6 +374,9 @@ struct pv_vcpu
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
 
+    /* data breakpoint extension MSRs */
+    uint32_t dr_mask[4];
+
     /* Deferred VA-based update state. */
     bool_t need_update_runstate_area;
     struct vcpu_time_info pending_system_time;
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -462,17 +462,6 @@ long set_gdt(struct vcpu *d,
              unsigned long *frames,
              unsigned int entries);
 
-#define write_debugreg(reg, val) do {                       \
-    unsigned long __val = val;                              \
-    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
-} while (0)
-#define read_debugreg(reg) ({                               \
-    unsigned long __val;                                    \
-    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
-    __val;                                                  \
-})
-long set_debugreg(struct vcpu *p, int reg, unsigned long value);
-
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 static always_inline void rep_nop(void)
 {
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -36,7 +36,7 @@
 #include "grant_table.h"
 #include "hvm/save.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000009
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
 
 /*
  * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
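With the traps.c emulation in place, a PV guest kernel that sees the
AMD data breakpoint extension in CPUID can pair a classic breakpoint
with an address mask to watch a power-of-two-sized region: set bits in
the mask exclude the corresponding address bits from the breakpoint
comparison. A hedged guest-side sketch; the DR writes go through the
real HYPERVISOR_set_debugreg hypercall (backed by set_debugreg()
above), while the mask MSR write traps into the WRMSR emulation:

    /* Sketch: watch a 64-byte region with one data breakpoint.
     * wrmsr() from a PV guest kernel traps to
     * emulate_privileged_op() above. */
    #include <stdint.h>

    static inline void wrmsr(uint32_t idx, uint64_t val)
    {
        asm volatile ( "wrmsr" : : "c" (idx),
                       "a" ((uint32_t)val), "d" ((uint32_t)(val >> 32)) );
    }

    static void watch_64_bytes(unsigned long addr)
    {
        /* Ignore address bits 5:0 in the DR0 match => 64-byte window. */
        wrmsr(MSR_AMD64_DR0_ADDRESS_MASK, 0x3f);
        HYPERVISOR_set_debugreg(0, addr & ~0x3fUL);
        /* DR7: RW0=01 (break on write), LEN0=11, G0 (global enable). */
        HYPERVISOR_set_debugreg(7, 0xd0002UL);
    }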
@@ -563,6 +563,16 @@ typedef struct xen_domctl_pin_mem_cachea
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
 
+#if defined(__i386__) || defined(__x86_64__)
+struct xen_domctl_ext_vcpu_msr {
+    uint32_t         index;
+    uint32_t         reserved;
+    uint64_aligned_t value;
+};
+typedef struct xen_domctl_ext_vcpu_msr xen_domctl_ext_vcpu_msr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpu_msr_t);
+#endif
+
 /* XEN_DOMCTL_set_ext_vcpucontext */
 /* XEN_DOMCTL_get_ext_vcpucontext */
 struct xen_domctl_ext_vcpucontext {
     /* IN: VCPU that this call applies to. */
@@ -582,6 +592,7 @@ struct xen_domctl_ext_vcpucontext {
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
+    uint16_t         msr_count;
 #if defined(__GNUC__)
     union {
         uint64_aligned_t mcg_cap;
@@ -590,6 +601,7 @@ struct xen_domctl_ext_vcpucontext {
 #else
     struct hvm_vmce_vcpu vmce;
 #endif
+    XEN_GUEST_HANDLE_64(xen_domctl_ext_vcpu_msr_t) msrs;
 #endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
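The new public struct is deliberately self-aligned: a 16-byte record
with an explicit reserved field (checked to be zero on the set path
above), so 32- and 64-bit tool stacks see the same layout and
msr_count records can simply be appended to the 128-byte fixed part in
the image. A build-time check a consumer might carry (not part of the
patch, purely illustrative):

    /* Illustrative compile-time layout checks. */
    #include <stddef.h>
    typedef char msr_record_is_16_bytes
        [sizeof(xen_domctl_ext_vcpu_msr_t) == 16 ? 1 : -1];
    typedef char msr_value_at_offset_8
        [offsetof(xen_domctl_ext_vcpu_msr_t, value) == 8 ? 1 : -1];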