[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 03/10] x86/cpuid: Handle leaf 0x1 in guest_cpuid()



The feature words, ecx and edx, are already audited as part of the featureset
logic.  The existing leaf 0x80000001 dynamic logic has its SYSCALL adjustment
split out, as the rest of the adjustments are common with leaf 0x1.  The
existing leaf 0x1 feature adjustments from {pv,hvm}_cpuid() are moved
wholesale into guest_cpuid(), although deduped against the common adjustments.

The eax word is family/model/stepping information, and is fine to use as
provided by the toolstack, although with reserved bits cleared.

The ebx word is more problematic.  The low 8 bits are the brand ID and safe to
pass straight through.  The next 8 bits are the CLFLUSH line size.  This value
is forwarded straight from hardware, as nothing good can possibly come of
providing an alternative value to the guest.

The next 8 bits are slightly different between Intel and AMD, but are both
some property of the number of logical cores in the current physical package.
For now, the toolstack value is used unchanged until better topology support
is available.

The final 8 bits are the initial legacy APIC ID.  For HVM guests, this was
overridden to vcpu_id * 2.  The same logic is now applied to PV guests, so
guests don't observe a constant number on all vcpus via their emulated or
faulted view.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
CC: Jan Beulich <JBeulich@xxxxxxxx>
CC: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>

Boris: This conflicts textually but not functionally with your vPMU
adjustments.  Whichever way round we end up needing to rebase should be easy.
---
 xen/arch/x86/cpuid.c        | 351 +++++++++++++++++++-------------------------
 xen/include/asm-x86/cpuid.h |   6 +-
 2 files changed, 158 insertions(+), 199 deletions(-)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index e0a387e..3ecb794 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -176,6 +176,9 @@ static void recalculate_misc(struct cpuid_policy *p)
     switch ( p->x86_vendor )
     {
     case X86_VENDOR_INTEL:
+        p->basic.raw_fms &= 0x0fff3fff;
+        p->basic.apic_id = 0; /* Dynamic. */
+
         p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
         p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
         p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
@@ -194,6 +197,9 @@ static void recalculate_misc(struct cpuid_policy *p)
         break;
 
     case X86_VENDOR_AMD:
+        p->basic.raw_fms &= 0x0fff0fff;
+        p->basic.apic_id = 0; /* Dynamic. */
+
         zero_leaves(p->basic.raw, 0x2, 0x3);
         p->basic.raw[0x9] = EMPTY_LEAF;
 
@@ -502,6 +508,9 @@ void recalculate_cpuid_policy(struct domain *d)
 
     cpuid_featureset_to_policy(fs, p);
 
+    /* Pass host cacheline size through to guests. */
+    p->basic.clflush_size = max->basic.clflush_size;
+
     p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
     p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
                                 d->arch.paging.gfn_bits + PAGE_SHIFT);
@@ -574,7 +583,6 @@ static void pv_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res)
 {
     struct vcpu *curr = current;
     struct domain *currd = curr->domain;
-    const struct cpuid_policy *p = currd->arch.cpuid;
 
     if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
         domain_cpuid(currd, leaf, subleaf, res);
@@ -583,147 +591,6 @@ static void pv_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res)
 
     switch ( leaf )
     {
-    case 0x00000001:
-        res->c = p->basic._1c;
-        res->d = p->basic._1d;
-
-        if ( !is_pvh_domain(currd) )
-        {
-            const struct cpu_user_regs *regs = guest_cpu_user_regs();
-
-            /*
-             * Delete the PVH condition when HVMLite formally replaces PVH,
-             * and HVM guests no longer enter a PV codepath.
-             */
-
-            /*
-             * !!! OSXSAVE handling for PV guests is non-architectural !!!
-             *
-             * Architecturally, the correct code here is simply:
-             *
-             *   if ( curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
-             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
-             *
-             * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
-             * the XSAVE cpuid flag leaked into guests despite the feature not
-             * being available for use), buggy workarounds where introduced to
-             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
-             * that Xen also incorrectly leaked OSXSAVE into the guest.
-             *
-             * Furthermore, providing architectural OSXSAVE behaviour to a
-             * many Linux PV guests triggered a further kernel bug when the
-             * fpu code observes that XSAVEOPT is available, assumes that
-             * xsave state had been set up for the task, and follows a wild
-             * pointer.
-             *
-             * Older Linux PVOPS kernels however do require architectural
-             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
-             * can already use XSETBV, dying with a #UD because the shadowed
-             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
-             * observed cases via stable backports of the above changeset.
-             *
-             * Therefore, the leaking of Xen's OSXSAVE setting has become a
-             * defacto part of the PV ABI and can't reasonably be corrected.
-             * It can however be restricted to only the enlightened CPUID
-             * view, as seen by the guest kernel.
-             *
-             * The following situations and logic now applies:
-             *
-             * - Hardware without CPUID faulting support and native CPUID:
-             *    There is nothing Xen can do here.  The hosts XSAVE flag will
-             *    leak through and Xen's OSXSAVE choice will leak through.
-             *
-             *    In the case that the guest kernel has not set up OSXSAVE, only
-             *    SSE will be set in xcr0, and guest userspace can't do too much
-             *    damage itself.
-             *
-             * - Enlightened CPUID or CPUID faulting available:
-             *    Xen can fully control what is seen here.  Guest kernels need
-             *    to see the leaked OSXSAVE via the enlightened path, but
-             *    guest userspace and the native is given architectural
-             *    behaviour.
-             *
-             *    Emulated vs Faulted CPUID is distinguised based on whether a
-             *    #UD or #GP is currently being serviced.
-             */
-            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
-            if ( (curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
-                 (regs->entry_vector == TRAP_invalid_op &&
-                  guest_kernel_mode(curr, regs) &&
-                  (read_cr4() & X86_CR4_OSXSAVE)) )
-                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
-
-            /*
-             * At the time of writing, a PV domain is the only viable option
-             * for Dom0.  Several interactions between dom0 and Xen for real
-             * hardware setup have unfortunately been implemented based on
-             * state which incorrectly leaked into dom0.
-             *
-             * These leaks are retained for backwards compatibility, but
-             * restricted to the hardware domains kernel only.
-             */
-            if ( is_hardware_domain(currd) && guest_kernel_mode(curr, regs) )
-            {
-                /*
-                 * MTRR used to unconditionally leak into PV guests.  They
-                 * cannot MTRR infrastructure at all, and shouldn't be able to
-                 * see the feature.
-                 *
-                 * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
-                 * trying to use the associated MSRs.  Xenolinux-based PV dom0's
-                 * however use the MTRR feature as an indication of the presence
-                 * of the XENPF_{add,del,read}_memtype hypercalls.
-                 */
-                if ( cpu_has_mtrr )
-                    res->d |= cpufeat_mask(X86_FEATURE_MTRR);
-
-                /*
-                 * MONITOR never leaked into PV guests, as PV guests cannot
-                 * use the MONITOR/MWAIT instructions.  As such, they require
-                 * the feature to not being present in emulated CPUID.
-                 *
-                 * Modern PVOPS Linux try to be cunning and use native CPUID
-                 * to see if the hardware actually supports MONITOR, and by
-                 * extension, deep C states.
-                 *
-                 * If the feature is seen, deep-C state information is
-                 * obtained from the DSDT and handed back to Xen via the
-                 * XENPF_set_processor_pminfo hypercall.
-                 *
-                 * This mechanism is incompatible with an HVM-based hardware
-                 * domain, and also with CPUID Faulting.
-                 *
-                 * Luckily, Xen can be just as 'cunning', and distinguish an
-                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
-                 * fault is currently being serviced.  Yuck...
-                 */
-                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
-                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);
-
-                /*
-                 * While MONITOR never leaked into PV guests, EIST always used
-                 * to.
-                 *
-                 * Modern PVOPS will only parse P state information from the
-                 * DSDT and return it to Xen if EIST is seen in the emulated
-                 * CPUID information.
-                 */
-                if ( cpu_has_eist )
-                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
-            }
-        }
-
-        if ( vpmu_enabled(curr) &&
-             vpmu_is_set(vcpu_vpmu(curr), VPMU_CPU_HAS_DS) )
-        {
-            res->d |= cpufeat_mask(X86_FEATURE_DS);
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
-                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
-                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
-        }
-        break;
-
     case 0x0000000a: /* Architectural Performance Monitor Features (Intel) */
         if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
              !vpmu_enabled(curr) )
@@ -740,8 +607,7 @@ static void pv_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res)
         *res = EMPTY_LEAF;
         break;
 
-    case 0x0:
-    case 0x2 ... 0x3:
+    case 0x0 ... 0x3:
     case 0x7 ... 0x9:
     case 0xc ... XSTATE_CPUID:
     case 0x80000000 ... 0xffffffff:
@@ -754,57 +620,11 @@ static void hvm_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res)
 {
     struct vcpu *v = current;
     struct domain *d = v->domain;
-    const struct cpuid_policy *p = d->arch.cpuid;
 
     domain_cpuid(d, leaf, subleaf, res);
 
     switch ( leaf )
     {
-    case 0x1:
-        /* Fix up VLAPIC details. */
-        res->b &= 0x00FFFFFFu;
-        res->b |= (v->vcpu_id * 2) << 24;
-
-        res->c = p->basic._1c;
-        res->d = p->basic._1d;
-
-        /* APIC exposed to guests, but Fast-forward MSR_APIC_BASE.EN back in. */
-        if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
-            res->d &= ~cpufeat_bit(X86_FEATURE_APIC);
-
-        /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
-        if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
-            res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
-
-        /*
-         * PSE36 is not supported in shadow mode.  This bit should be
-         * unilaterally cleared.
-         *
-         * However, an unspecified version of Hyper-V from 2011 refuses
-         * to start as the "cpu does not provide required hw features" if
-         * it can't see PSE36.
-         *
-         * As a workaround, leak the toolstack-provided PSE36 value into a
-         * shadow guest if the guest is already using PAE paging (and won't
-         * care about reverting back to PSE paging).  Otherwise, knoble it, so
-         * a 32bit guest doesn't get the impression that it could try to use
-         * PSE36 paging.
-         */
-        if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
-            res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
-
-        if ( vpmu_enabled(v) &&
-             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
-        {
-            res->d |= cpufeat_mask(X86_FEATURE_DS);
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
-                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
-            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
-                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
-        }
-
-        break;
-
     case 0xb:
         /* Fix the x2APIC identifier. */
         res->d = v->vcpu_id * 2;
@@ -822,8 +642,7 @@ static void hvm_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res)
             res->a = (res->a & ~0xff) | 3;
         break;
 
-    case 0x0:
-    case 0x2 ... 0x3:
+    case 0x0 ... 0x3:
     case 0x7 ... 0x9:
     case 0xc ... XSTATE_CPUID:
     case 0x80000000 ... 0xffffffff:
@@ -876,8 +695,7 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
         default:
             goto legacy;
 
-        case 0x0:
-        case 0x2 ... 0x3:
+        case 0x0 ... 0x3:
         case 0x8 ... 0x9:
         case 0xc:
             *res = p->basic.raw[leaf];
@@ -928,6 +746,141 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
      */
     switch ( leaf )
     {
+        const struct cpu_user_regs *regs;
+
+    case 0x1:
+        /* TODO: Rework topology logic. */
+        res->b &= 0x00ffffffu;
+        res->b |= (v->vcpu_id * 2) << 24;
+
+        /* TODO: Rework vPMU control in terms of toolstack choices. */
+        if ( vpmu_enabled(v) &&
+             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
+        {
+            res->d |= cpufeat_mask(X86_FEATURE_DS);
+            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
+                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
+            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
+                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
+        }
+
+        if ( has_hvm_container_domain(d) )
+        {
+            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
+            if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
+                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+        }
+        else /* PV domain */
+        {
+            regs = guest_cpu_user_regs();
+
+            /*
+             * !!! OSXSAVE handling for PV guests is non-architectural !!!
+             *
+             * Architecturally, the correct code here is simply:
+             *
+             *   if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
+             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+             *
+             * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
+             * the XSAVE cpuid flag leaked into guests despite the feature not
+             * being available for use), buggy workarounds where introduced to
+             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
+             * that Xen also incorrectly leaked OSXSAVE into the guest.
+             *
+             * Furthermore, providing architectural OSXSAVE behaviour to a
+             * many Linux PV guests triggered a further kernel bug when the
+             * fpu code observes that XSAVEOPT is available, assumes that
+             * xsave state had been set up for the task, and follows a wild
+             * pointer.
+             *
+             * Older Linux PVOPS kernels however do require architectural
+             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
+             * can already use XSETBV, dying with a #UD because the shadowed
+             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
+             * observed cases via stable backports of the above changeset.
+             *
+             * Therefore, the leaking of Xen's OSXSAVE setting has become a
+             * defacto part of the PV ABI and can't reasonably be corrected.
+             * It can however be restricted to only the enlightened CPUID
+             * view, as seen by the guest kernel.
+             *
+             * The following situations and logic now applies:
+             *
+             * - Hardware without CPUID faulting support and native CPUID:
+             *    There is nothing Xen can do here.  The hosts XSAVE flag will
+             *    leak through and Xen's OSXSAVE choice will leak through.
+             *
+             *    In the case that the guest kernel has not set up OSXSAVE, only
+             *    SSE will be set in xcr0, and guest userspace can't do too much
+             *    damage itself.
+             *
+             * - Enlightened CPUID or CPUID faulting available:
+             *    Xen can fully control what is seen here.  Guest kernels need
+             *    to see the leaked OSXSAVE via the enlightened path, but
+             *    guest userspace and the native is given architectural
+             *    behaviour.
+             *
+             *    Emulated vs Faulted CPUID is distinguised based on whether a
+             *    #UD or #GP is currently being serviced.
+             */
+            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
+            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
+                 (regs->entry_vector == TRAP_invalid_op &&
+                  guest_kernel_mode(v, regs) &&
+                  (read_cr4() & X86_CR4_OSXSAVE)) )
+                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
+
+            /*
+             * At the time of writing, a PV domain is the only viable option
+             * for Dom0.  Several interactions between dom0 and Xen for real
+             * hardware setup have unfortunately been implemented based on
+             * state which incorrectly leaked into dom0.
+             *
+             * These leaks are retained for backwards compatibility, but
+             * restricted to the hardware domains kernel only.
+             */
+            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
+            {
+                /*
+                 * MONITOR never leaked into PV guests, as PV guests cannot
+                 * use the MONITOR/MWAIT instructions.  As such, they require
+                 * the feature to not being present in emulated CPUID.
+                 *
+                 * Modern PVOPS Linux try to be cunning and use native CPUID
+                 * to see if the hardware actually supports MONITOR, and by
+                 * extension, deep C states.
+                 *
+                 * If the feature is seen, deep-C state information is
+                 * obtained from the DSDT and handed back to Xen via the
+                 * XENPF_set_processor_pminfo hypercall.
+                 *
+                 * This mechanism is incompatible with an HVM-based hardware
+                 * domain, and also with CPUID Faulting.
+                 *
+                 * Luckily, Xen can be just as 'cunning', and distinguish an
+                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
+                 * fault is currently being serviced.  Yuck...
+                 */
+                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
+                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);
+
+                /*
+                 * While MONITOR never leaked into PV guests, EIST always used
+                 * to.
+                 *
+                 * Modern PVOPS Linux will only parse P state information from
+                 * the DSDT and return it to Xen if EIST is seen in the
+                 * emulated CPUID information.
+                 */
+                if ( cpu_has_eist )
+                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
+            }
+        }
+
+        /* Adjustments common with leaf 0x80000001. */
+        goto common_dynamic_adjustments;
+
     case 0x7:
         switch ( subleaf )
         {
@@ -967,6 +920,12 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
         break;
 
     case 0x80000001:
+        /* SYSCALL is hidden outside of long mode on Intel. */
+        if ( p->x86_vendor == X86_VENDOR_INTEL &&
+             has_hvm_container_domain(d) && !hvm_long_mode_enabled(v) )
+            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
+
+    common_dynamic_adjustments: /* Adjustments common with leaf 1. */
         if ( has_hvm_container_domain(d) )
         {
             /* Fast-forward MSR_APIC_BASE.EN. */
@@ -989,10 +948,6 @@ void guest_cpuid(const struct vcpu *v, uint32_t leaf,
              */
             if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                 res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
-
-            /* SYSCALL is hidden outside of long mode on Intel. */
-            if ( p->x86_vendor == X86_VENDOR_INTEL && !hvm_long_mode_enabled(v) )
-                res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
         }
         else /* PV domain */
         {
diff --git a/xen/include/asm-x86/cpuid.h b/xen/include/asm-x86/cpuid.h
index bc3fc7c..6d1990b 100644
--- a/xen/include/asm-x86/cpuid.h
+++ b/xen/include/asm-x86/cpuid.h
@@ -106,7 +106,11 @@ struct cpuid_policy
             uint32_t max_leaf, vendor_ebx, vendor_ecx, vendor_edx;
 
             /* Leaf 0x1 - Family/model/stepping and features. */
-            uint32_t raw_fms, /* b */:32;
+            uint32_t raw_fms;
+            uint8_t :8,       /* Brand ID. */
+                clflush_size, /* Number of 8-byte blocks per cache line. */
+                lppp,         /* Logical processors per package. */
+                apic_id;      /* Initial APIC ID. */
             union {
                 uint32_t _1c;
                 struct { DECL_BITFIELD(1c); };
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.