Re: [Xen-devel] [PATCH v5] x86/AMD: Add support for AMD's OSVW feature in guests



>>> On 06.02.12 at 18:39, Boris Ostrovsky <boris.ostrovsky@xxxxxxx> wrote:
> # HG changeset patch
> # User Boris Ostrovsky <boris.ostrovsky@xxxxxxx>
> # Date 1328549858 -3600
> # Node ID 3cf8ffd0ab883dd09f943f4d8fb50f5cc1f04cd5
> # Parent  e2722b24dc0962de37215320b05d1bb7c4c42864
> x86/AMD: Add support for AMD's OSVW feature in guests.
> 
> In some cases guests should not provide workarounds for errata even when
> the physical processor is affected. For example, because of erratum 400
> on family 10h processors a Linux guest will read an MSR (resulting in a
> VMEXIT) before going to idle in order to avoid getting stuck in a non-C0
> state. This is not necessary: HLT and IO instructions are intercepted,
> and therefore there is no need for the erratum 400 workaround in the
> guest.
> 
> This patch allows us to present certain errata to a guest as fixed,
> regardless of the state of the actual hardware.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxx>
> Acked-by: Christoph Egger <Christoph.Egger@xxxxxxx>

In the form below/attached (the integration with boot time microcode
loading has been fixed and trailing white space removed):

Acked-by: Jan Beulich <jbeulich@xxxxxxxx>

-- Jan

In some cases guests should not provide workarounds for errata even when the
physical processor is affected. For example, because of erratum 400 on family
10h processors a Linux guest will read an MSR (resulting in a VMEXIT) before
going to idle in order to avoid getting stuck in a non-C0 state. This is not
necessary: HLT and IO instructions are intercepted, and therefore there is no
need for the erratum 400 workaround in the guest.

This patch allows us to present certain errata to a guest as fixed,
regardless of the state of the actual hardware.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxx>
Acked-by: Christoph Egger <Christoph.Egger@xxxxxxx>
Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
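
For reference, the guest-side logic that this patch caters to works as
follows: before enabling the erratum 400 workaround, an OSVW-aware OS checks
CPUID 0x80000001 ECX bit 9 and, if set, consults the OSVW MSRs; a status bit
that is clear within the valid length means the erratum is fixed, which is
what the patch arranges for bits 1 and 2. Below is a minimal sketch of such
a check, using Linux-style cpuid_ecx() and rdmsrl_safe() helpers (the
function name and the OSVW id for erratum 400 follow Linux's usage and are
not part of this patch):

#include <linux/types.h>             /* u64 */
#include <asm/msr.h>                 /* rdmsrl_safe() */
#include <asm/processor.h>           /* cpuid_ecx() */

#define MSR_AMD_OSVW_ID_LENGTH  0xc0010140
#define MSR_AMD_OSVW_STATUS     0xc0010141
#define OSVW_ID_ERRATUM_400     1    /* status bit assigned to erratum 400 */

static int erratum_400_needs_workaround(void)
{
    u64 len, status;

    /* CPUID Fn8000_0001 ECX[9] advertises OSVW support. */
    if ( !(cpuid_ecx(0x80000001) & (1U << 9)) )
        return 1;                    /* no OSVW: assume we are affected */

    /* A status bit is only meaningful if it lies within the OSVW length. */
    if ( rdmsrl_safe(MSR_AMD_OSVW_ID_LENGTH, &len) ||
         len <= OSVW_ID_ERRATUM_400 )
        return 1;

    if ( rdmsrl_safe(MSR_AMD_OSVW_STATUS, &status) )
        return 1;

    /* Bit set: erratum present; bit clear: erratum fixed. */
    return !!(status & (1ULL << OSVW_ID_ERRATUM_400));
}

With osvw.status bit 1 cleared by the hypervisor, a guest running this check
skips the workaround MSR access on its idle path and thus avoids the
pointless VMEXIT described above.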

--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -108,6 +108,7 @@ static void amd_xc_cpuid_policy(
                     bitmaskof(X86_FEATURE_SSE4A) |
                     bitmaskof(X86_FEATURE_MISALIGNSSE) |
                     bitmaskof(X86_FEATURE_3DNOWPREFETCH) |
+                    bitmaskof(X86_FEATURE_OSVW) |
                     bitmaskof(X86_FEATURE_XOP) |
                     bitmaskof(X86_FEATURE_FMA4) |
                     bitmaskof(X86_FEATURE_TBM) |
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -83,6 +83,10 @@ static DEFINE_PER_CPU_READ_MOSTLY(void *
 
 static bool_t amd_erratum383_found __read_mostly;
 
+/* OSVW bits */
+static uint64_t osvw_length, osvw_status;
+static DEFINE_SPINLOCK(osvw_lock);
+
 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
 {
     struct vcpu *curr = current;
@@ -902,6 +906,69 @@ static void svm_do_resume(struct vcpu *v
     reset_stack_and_jump(svm_asm_do_resume);
 }
 
+static void svm_guest_osvw_init(struct vcpu *vcpu)
+{
+    if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+        return;
+
+    /*
+     * Guests should see errata 400 and 415 (status bits 1 and 2, hence
+     * the ~6 mask below) as fixed, assuming HLT and IO are intercepted.
+     */
+    vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3;
+    vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL);
+
+    /*
+     * By increasing the VCPU's osvw.length to 3 we are telling the guest
+     * that all osvw.status bits inside that length, including bit 0 (which
+     * is reserved for erratum 298), are valid. However, if the host
+     * processor's osvw_length is 0 then osvw_status[0] carries no
+     * information. We must be conservative here and therefore tell the
+     * guest that erratum 298 is present (because we really don't know).
+     */
+    if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 )
+        vcpu->arch.hvm_svm.osvw.status |= 1;
+}
+
+void svm_host_osvw_reset(void)
+{
+    spin_lock(&osvw_lock);
+
+    osvw_length = 64; /* One register (MSRC001_0141) worth of errata */
+    osvw_status = 0;
+
+    spin_unlock(&osvw_lock);
+}
+
+void svm_host_osvw_init(void)
+{
+    spin_lock(&osvw_lock);
+
+    /*
+     * Get OSVW bits. If the bits are not the same on different processors
+     * then choose the worst case (i.e. if an erratum is present on one
+     * processor and not on another, assume it is present everywhere).
+     */
+    if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) )
+    {
+        uint64_t len, status;
+
+        if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) ||
+             rdmsr_safe(MSR_AMD_OSVW_STATUS, status) )
+            len = status = 0;
+
+        if ( len < osvw_length )
+            osvw_length = len;
+
+        osvw_status |= status;
+        osvw_status &= (1ULL << osvw_length) - 1;
+    }
+    else
+        osvw_length = osvw_status = 0;
+
+    spin_unlock(&osvw_lock);
+}
+
 static int svm_domain_initialise(struct domain *d)
 {
     return 0;
@@ -930,6 +997,9 @@ static int svm_vcpu_initialise(struct vc
     }
 
     vpmu_initialise(v);
+
+    svm_guest_osvw_init(v);
+
     return 0;
 }
 
@@ -1044,6 +1114,27 @@ static void svm_init_erratum_383(struct 
     }
 }
 
+static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read)
+{
+    unsigned int eax, ebx, ecx, edx;
+
+    /* Guest OSVW support */
+    hvm_cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+    if ( !test_bit((X86_FEATURE_OSVW & 31), &ecx) )
+        return -1;
+
+    if ( read )
+    {
+        if ( msr == MSR_AMD_OSVW_ID_LENGTH )
+            *val = v->arch.hvm_svm.osvw.length;
+        else
+            *val = v->arch.hvm_svm.osvw.status;
+    }
+    /* Writes are ignored */
+
+    return 0;
+}
+
 static int svm_cpu_up(void)
 {
     uint64_t msr_content;
@@ -1094,6 +1185,9 @@ static int svm_cpu_up(void)
     }
 #endif
 
+    /* Initialize OSVW bits to be used by guests */
+    svm_host_osvw_init();
+
     return 0;
 }
 
@@ -1104,6 +1198,8 @@ struct hvm_function_table * __init start
     if ( !test_bit(X86_FEATURE_SVM, &boot_cpu_data.x86_capability) )
         return NULL;
 
+    svm_host_osvw_reset();
+
     if ( svm_cpu_up() )
     {
         printk("SVM: failed to initialise.\n");
@@ -1388,6 +1484,13 @@ static int svm_msr_read_intercept(unsign
         vpmu_do_rdmsr(msr, msr_content);
         break;
 
+    case MSR_AMD_OSVW_ID_LENGTH:
+    case MSR_AMD_OSVW_STATUS:
+        ret = svm_handle_osvw(v, msr, msr_content, 1);
+        if ( ret < 0 )
+            goto gpf;
+        break;
+
     default:
         ret = nsvm_rdmsr(v, msr, msr_content);
         if ( ret < 0 )
@@ -1512,6 +1615,13 @@ static int svm_msr_write_intercept(unsig
          */
         break;
 
+    case MSR_AMD_OSVW_ID_LENGTH:
+    case MSR_AMD_OSVW_STATUS:
+        ret = svm_handle_osvw(v, msr, &msr_content, 0);
+        if ( ret < 0 )
+            goto gpf;
+        break;
+
     default:
         ret = nsvm_wrmsr(v, msr, msr_content);
         if ( ret < 0 )
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -218,6 +218,16 @@ int microcode_update(XEN_GUEST_HANDLE(co
     info->error = 0;
     info->cpu = cpumask_first(&cpu_online_map);
 
+    if ( microcode_ops->start_update )
+    {
+        ret = microcode_ops->start_update();
+        if ( ret != 0 )
+        {
+            xfree(info);
+            return ret;
+        }
+    }
+
     return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
 }
 
@@ -240,6 +250,12 @@ static int __init microcode_init(void)
     if ( !data )
         return -ENOMEM;
 
+    if ( microcode_ops->start_update && microcode_ops->start_update() != 0 )
+    {
+        ucode_mod_map(NULL);
+        return 0;
+    }
+
     softirq_tasklet_init(&tasklet, _do_microcode_update, (unsigned long)data);
 
     for_each_online_cpu ( cpu )
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -25,6 +25,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/microcode.h>
+#include <asm/hvm/svm/svm.h>
 
 struct equiv_cpu_entry {
     uint32_t installed_cpu;
@@ -71,6 +72,7 @@ struct mpbhdr {
 /* serialize access to the physical write */
 static DEFINE_SPINLOCK(microcode_update_lock);
 
+/* See comment in start_update() for cases when this routine fails */
 static int collect_cpu_info(int cpu, struct cpu_signature *csig)
 {
     struct cpuinfo_x86 *c = &cpu_data[cpu];
@@ -287,7 +289,8 @@ static int cpu_request_microcode(int cpu
     {
         printk(KERN_ERR "microcode: error! Wrong "
                "microcode patch file magic\n");
-        return -EINVAL;
+        error = -EINVAL;
+        goto out;
     }
 
     mc_amd = xmalloc(struct microcode_amd);
@@ -295,7 +298,8 @@ static int cpu_request_microcode(int cpu
     {
         printk(KERN_ERR "microcode: error! "
                "Can not allocate memory for microcode patch\n");
-        return -ENOMEM;
+        error = -ENOMEM;
+        goto out;
     }
 
     error = install_equiv_cpu_table(mc_amd, buf, &offset);
@@ -303,7 +307,8 @@ static int cpu_request_microcode(int cpu
     {
         xfree(mc_amd);
         printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
-        return -EINVAL;
+        error = -EINVAL;
+        goto out;
     }
 
     mc_old = uci->mc.mc_amd;
@@ -337,13 +342,19 @@ static int cpu_request_microcode(int cpu
     /* On success keep the microcode patch for
      * re-apply on resume.
      */
-    if (error == 1)
+    if ( error == 1 )
     {
         xfree(mc_old);
-        return 0;
+        error = 0;
+    }
+    else
+    {
+        xfree(mc_amd);
+        uci->mc.mc_amd = mc_old;
     }
-    xfree(mc_amd);
-    uci->mc.mc_amd = mc_old;
+
+  out:
+    svm_host_osvw_init();
 
     return error;
 }
@@ -395,11 +406,28 @@ err1:
     return -ENOMEM;
 }
 
+static int start_update(void)
+{
+    /*
+     * We assume here that svm_host_osvw_init() will be called on each cpu (from
+     * cpu_request_microcode()).
+     *
+     * Note that if collect_cpu_info() returns an error then
+     * cpu_request_microcode() will not be invoked, thus leaving the OSVW
+     * bits not updated. Currently, though, collect_cpu_info() will not fail
+     * on processors supporting OSVW, so we do not deal with this possibility.
+     */
+    svm_host_osvw_reset();
+
+    return 0;
+}
+
 static const struct microcode_ops microcode_amd_ops = {
     .microcode_resume_match           = microcode_resume_match,
     .cpu_request_microcode            = cpu_request_microcode,
     .collect_cpu_info                 = collect_cpu_info,
     .apply_microcode                  = apply_microcode,
+    .start_update                     = start_update,
 };
 
 static __init int microcode_init_amd(void)
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -166,7 +166,21 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
             break;
 
         guest_from_compat_handle(data, op->u.microcode.data);
+
+        /*
+         * alloc_vcpu() will access data which is modified during a
+         * microcode update.
+         */
+        while ( !spin_trylock(&vcpu_alloc_lock) )
+            if ( hypercall_preempt_check() )
+            {
+                ret = hypercall_create_continuation(
+                    __HYPERVISOR_platform_op, "h", u_xenpf_op);
+                goto out;
+            }
+
         ret = microcode_update(data, op->u.microcode.length);
+        spin_unlock(&vcpu_alloc_lock);
     }
     break;
 
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -29,6 +29,7 @@
 #include <xsm/xsm.h>
 
 static DEFINE_SPINLOCK(domctl_lock);
+DEFINE_SPINLOCK(vcpu_alloc_lock);
 
 int cpumask_to_xenctl_cpumap(
     struct xenctl_cpumap *xenctl_cpumap, const cpumask_t *cpumask)
@@ -506,6 +507,18 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         /* Needed, for example, to ensure writable p.t. state is synced. */
         domain_pause(d);
 
+        /*
+         * Certain operations (e.g. CPU microcode updates) modify data which is
+         * used during VCPU allocation/initialization.
+         */
+        while ( !spin_trylock(&vcpu_alloc_lock) )
+            if ( hypercall_preempt_check() )
+            {
+                ret = hypercall_create_continuation(
+                    __HYPERVISOR_domctl, "h", u_domctl);
+                goto maxvcpu_out_novcpulock;
+            }
+
         /* We cannot reduce maximum VCPUs. */
         ret = -EINVAL;
         if ( (max < d->max_vcpus) && (d->vcpu[max] != NULL) )
@@ -555,6 +568,9 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         ret = 0;
 
     maxvcpu_out:
+        spin_unlock(&vcpu_alloc_lock);
+
+    maxvcpu_out_novcpulock:
         domain_unpause(d);
         rcu_unlock_domain(d);
     }
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -98,4 +98,7 @@ extern u32 svm_feature_flags;
                                   ~TSC_RATIO_RSVD_BITS )
 #define vcpu_tsc_ratio(v)       TSC_RATIO((v)->domain->arch.tsc_khz, cpu_khz)
 
+extern void svm_host_osvw_reset(void);
+extern void svm_host_osvw_init(void);
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -516,6 +516,12 @@ struct arch_svm_struct {
     
     /* AMD lightweight profiling MSR */
     uint64_t guest_lwp_cfg;
+
+    /* OSVW MSRs */
+    struct {
+        u64 length;
+        u64 status;
+    } osvw;
 };
 
 struct vmcb_struct *alloc_vmcb(void);
--- a/xen/include/asm-x86/microcode.h
+++ b/xen/include/asm-x86/microcode.h
@@ -11,6 +11,7 @@ struct microcode_ops {
     int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
     int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
     int (*apply_microcode)(int cpu);
+    int (*start_update)(void);
 };
 
 struct cpu_signature {
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -69,6 +69,7 @@ void arch_dump_domain_info(struct domain
 
 void arch_vcpu_reset(struct vcpu *v);
 
+extern spinlock_t vcpu_alloc_lock;
 bool_t domctl_lock_acquire(void);
 void domctl_lock_release(void);
 


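A note on the locking added to platform_hypercall.c and domctl.c above: a
microcode update rewrites osvw_length/osvw_status, which svm_guest_osvw_init()
reads when a VCPU is created, so both paths are serialized on the new
vcpu_alloc_lock. Rather than spinning for the potentially long duration of an
update, a preemptible hypercall bails out and lets the guest retry through a
continuation. Condensed, the pattern looks like this (the enclosing function
is hypothetical; the primitives are the Xen ones used in the patch):

long do_some_op(XEN_GUEST_HANDLE(xen_some_op_t) u_arg)  /* hypothetical */
{
    long ret;

    /*
     * Try to take the lock; if it is contended and preemption is pending,
     * restart the hypercall from scratch later instead of spinning with
     * interrupts left to service.
     */
    while ( !spin_trylock(&vcpu_alloc_lock) )
        if ( hypercall_preempt_check() )
            return hypercall_create_continuation(
                __HYPERVISOR_platform_op, "h", u_arg);

    ret = 0;  /* ... critical section touching VCPU-allocation data ... */

    spin_unlock(&vcpu_alloc_lock);
    return ret;
}
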
Attachment: AMD-OSVW-guest
Description: Binary data
