[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 6/8] x86/EPT: force reâevaluation of memory type as necessary



The main goal here is to drop the bogus dependency of
epte_get_entry_emt() on d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT].

Any change to state influencing epte_get_entry_emt()'s decision needs
to result in re-calculation. Do this by using the EPT_MISCONFIG VM
exit, storing an invalid memory type into EPT's emt field (leaving the
IOMMU, which doesn't care about memory types, unaffected).

This is being done in a hierarchical manner to keep execution time
down: Initially only the top level directory gets invalidated this way.
Upon access, the involved intermediate page table levels get cleared
back to zero, and the leaf entry gets its field properly set. For 4k
leaves all other entries in the same directory also get processed to
amortize the cost of the extra VM exit (which halved the number of
these VM exits in my testing).

This restoring can result in spurious EPT_MISCONFIG VM exits (since
two vCPU-s may access addresses involving identical page table
structures). Rather than simply returning in such cases (and risking
that such a VM exit results from a real mis-configuration, which
would then result in an endless loop rather than killing the VM), a
per-vCPU flag is being introduced indicating when such a spurious VM
exit might validly happen - if another one occurs right after VM re-
entry, the flag would generally end up being clear, causing the VM
to be killed as before on such VM exits.

Note that putting a reserved memory type value in the EPT structures
isn't formally sanctioned by the specification. Intel isn't willing to
adjust the specification to make this or a similar use of the
EPT_MISCONFIG VM exit formally possible, but they have indicated that
us using this is low risk wrt forward compatibility.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -84,6 +84,8 @@ long arch_do_domctl(
             ret = ioports_permit_access(d, fp, fp + np - 1);
         else
             ret = ioports_deny_access(d, fp, fp + np - 1);
+        if ( !ret )
+            memory_type_changed(d);
     }
     break;
 
@@ -705,6 +707,8 @@ long arch_do_domctl(
                        ret, add ? "removing" : "denying", d->domain_id,
                        mfn, mfn + nr_mfns - 1);
         }
+        /* Do this unconditionally to cover errors on above failure paths. */
+        memory_type_changed(d);
     }
     break;
 
@@ -791,6 +795,8 @@ long arch_do_domctl(
                        "ioport_map: error %ld denying dom%d access to 
[%x,%x]\n",
                        ret, d->domain_id, fmp, fmp + np - 1);
         }
+        if ( !ret )
+            memory_type_changed(d);
     }
     break;
 
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u6
 
     if ( !hvm_funcs.set_guest_pat(v, guest_pat) )
         v->arch.hvm_vcpu.pat_cr = guest_pat;
+
+    memory_type_changed(v->domain);
+
     return 1;
 }
 
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -431,8 +431,12 @@ bool_t mtrr_def_type_msr_set(struct doma
          return 0;
     }
 
-    m->enabled = enabled;
-    m->def_type = def_type;
+    if ( m->enabled != enabled || m->def_type != def_type )
+    {
+        m->enabled = enabled;
+        m->def_type = def_type;
+        memory_type_changed(d);
+    }
 
     return 1;
 }
@@ -452,6 +456,7 @@ bool_t mtrr_fix_range_msr_set(struct dom
                 return 0;
 
         fixed_range_base[row] = msr_content;
+        memory_type_changed(d);
     }
 
     return 1;
@@ -496,6 +501,8 @@ bool_t mtrr_var_range_msr_set(
 
     m->overlapped = is_var_mtrr_overlapped(m);
 
+    memory_type_changed(d);
+
     return 1;
 }
 
@@ -690,6 +697,12 @@ static int hvm_load_mtrr_msr(struct doma
 HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
                           1, HVMSR_PER_VCPU);
 
+void memory_type_changed(struct domain *d)
+{
+    if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
+        p2m_memory_type_changed(d);
+}
+
 uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
                            uint8_t *ipat, bool_t direct_mmio)
 {
@@ -733,8 +746,7 @@ uint8_t epte_get_entry_emt(struct domain
         return MTRR_TYPE_WRBACK;
     }
 
-    gmtrr_mtype = is_hvm_domain(d) && v &&
-                  d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ?
+    gmtrr_mtype = is_hvm_domain(d) && v ?
                   get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) :
                   MTRR_TYPE_WRBACK;
 
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3011,6 +3011,16 @@ void vmx_vmexit_handler(struct cpu_user_
         break;
     }
 
+    case EXIT_REASON_EPT_MISCONFIG:
+    {
+        paddr_t gpa;
+
+        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
+        if ( !ept_handle_misconfig(gpa) )
+            goto exit_and_crash;
+        break;
+    }
+
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct
     p2m_unlock(p2m);
 }
 
+void p2m_memory_type_changed(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    if ( p2m->memory_type_changed )
+    {
+        p2m_lock(p2m);
+        p2m->memory_type_changed(p2m);
+        p2m_unlock(p2m);
+    }
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                     unsigned int *page_order, bool_t locked)
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -660,6 +660,130 @@ static void ept_change_entry_type_global
     ept_sync_domain(p2m);
 }
 
+static bool_t ept_invalidate_emt(mfn_t mfn)
+{
+    ept_entry_t *epte = map_domain_page(mfn_x(mfn));
+    unsigned int i;
+    bool_t changed = 0;
+
+    for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        ept_entry_t e = atomic_read_ept_entry(&epte[i]);
+
+        if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
+             e.emt == MTRR_NUM_TYPES )
+            continue;
+
+        e.emt = MTRR_NUM_TYPES;
+        atomic_write_ept_entry(&epte[i], e);
+        changed = 1;
+    }
+
+    unmap_domain_page(epte);
+
+    return changed;
+}
+
+static void ept_memory_type_changed(struct p2m_domain *p2m)
+{
+    unsigned long mfn = ept_get_asr(&p2m->ept);
+
+    if ( !mfn )
+        return;
+
+    if ( ept_invalidate_emt(_mfn(mfn)) )
+        ept_sync_domain(p2m);
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct ept_data *ept = &p2m->ept;
+    unsigned int level = ept_get_wl(ept);
+    unsigned long gfn = PFN_DOWN(gpa);
+    unsigned long mfn = ept_get_asr(ept);
+    ept_entry_t *epte;
+    bool_t okay;
+
+    if ( !mfn )
+        return 0;
+
+    p2m_lock(p2m);
+
+    for ( okay = curr->arch.hvm_vmx.ept_spurious_misconfig; ; --level) {
+        ept_entry_t e;
+        unsigned int i;
+
+        epte = map_domain_page(mfn);
+        i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
+        e = atomic_read_ept_entry(&epte[i]);
+
+        if ( level == 0 || is_epte_superpage(&e) )
+        {
+            uint8_t ipat = 0;
+            struct vcpu *v;
+
+            if ( e.emt != MTRR_NUM_TYPES )
+                break;
+
+            if ( level == 0 )
+            {
+                for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
+                {
+                    e = atomic_read_ept_entry(&epte[i]);
+                    if ( e.emt == MTRR_NUM_TYPES )
+                        e.emt = 0;
+                    if ( !is_epte_valid(&e) || !is_epte_present(&e) )
+                        continue;
+                    e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
+                                               _mfn(e.mfn), &ipat,
+                                               e.sa_p2mt == p2m_mmio_direct);
+                    e.ipat = ipat;
+                    atomic_write_ept_entry(&epte[i], e);
+                }
+            }
+            else
+            {
+                e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
+                                           &ipat,
+                                           e.sa_p2mt == p2m_mmio_direct);
+                e.ipat = ipat;
+                atomic_write_ept_entry(&epte[i], e);
+            }
+
+            for_each_vcpu ( curr->domain, v )
+                v->arch.hvm_vmx.ept_spurious_misconfig = 1;
+            okay = 1;
+            break;
+        }
+
+        if ( e.emt == MTRR_NUM_TYPES )
+        {
+            ASSERT(is_epte_present(&e));
+            e.emt = 0;
+            atomic_write_ept_entry(&epte[i], e);
+            unmap_domain_page(epte);
+
+            ept_invalidate_emt(_mfn(e.mfn));
+            okay = 1;
+        }
+        else if ( is_epte_present(&e) && !e.emt )
+            unmap_domain_page(epte);
+        else
+            break;
+
+        mfn = e.mfn;
+    }
+
+    unmap_domain_page(epte);
+    curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
+    ept_sync_domain(p2m);
+    p2m_unlock(p2m);
+
+    return okay;
+}
+
 static void __ept_sync_domain(void *info)
 {
     struct ept_data *ept = &((struct p2m_domain *)info)->ept;
@@ -697,6 +821,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
     p2m->set_entry = ept_set_entry;
     p2m->get_entry = ept_get_entry;
     p2m->change_entry_type_global = ept_change_entry_type_global;
+    p2m->memory_type_changed = ept_memory_type_changed;
     p2m->audit_p2m = NULL;
 
     /* Set the memory type used when accessing EPT paging structures. */
@@ -737,6 +862,7 @@ static void ept_dump_p2m_table(unsigned 
         [MTRR_TYPE_WRTHROUGH]      = "WT",
         [MTRR_TYPE_WRPROT]         = "WP",
         [MTRR_TYPE_WRBACK]         = "WB",
+        [MTRR_NUM_TYPES]           = "??"
     };
 
     for_each_domain(d)
@@ -750,11 +876,16 @@ static void ept_dump_p2m_table(unsigned 
 
         for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order )
         {
+            char c = 0;
+
             gfn_remainder = gfn;
             table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
             for ( i = ept_get_wl(ept); i > 0; i-- )
             {
+                ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
+                if ( ept_entry->emt == MTRR_NUM_TYPES )
+                    c = '?';
                 ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
                 if ( ret != GUEST_TABLE_NORMAL_PAGE )
                     break;
@@ -775,7 +906,7 @@ static void ept_dump_p2m_table(unsigned 
                            memoryTypes[ept_entry->emt][0],
                            memoryTypes[ept_entry->emt][1]
                            ?: ept_entry->emt + '0',
-                           ept_entry->ipat ? '!' : ' ');
+                           c ?: ept_entry->ipat ? '!' : ' ');
 
                 if ( !(record_counter++ % 100) )
                     process_pending_softirqs();
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -815,6 +815,10 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
             ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
         else
             ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+#ifdef CONFIG_X86
+        if ( !ret )
+            memory_type_changed(d);
+#endif
     }
     break;
 
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -124,6 +124,9 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */
+    bool_t               ept_spurious_misconfig;
+
     /* Is the guest in real mode? */
     uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
 void ept_p2m_uninit(struct p2m_domain *p2m);
 
 void ept_walk_table(struct domain *d, unsigned long gfn);
+bool_t ept_handle_misconfig(uint64_t gpa);
 void setup_ept_dump(void);
 
 void update_guest_eip(void);
--- a/xen/include/asm-x86/mtrr.h
+++ b/xen/include/asm-x86/mtrr.h
@@ -88,6 +88,7 @@ extern bool_t mtrr_fix_range_msr_set(str
                                      uint32_t row, uint64_t msr_content);
 extern bool_t mtrr_def_type_msr_set(struct domain *, struct mtrr_state *,
                                     uint64_t msr_content);
+extern void memory_type_changed(struct domain *);
 extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr);
 
 bool_t is_var_mtrr_overlapped(struct mtrr_state *m);
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -233,6 +233,7 @@ struct p2m_domain {
     void               (*change_entry_type_global)(struct p2m_domain *p2m,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
+    void               (*memory_type_changed)(struct p2m_domain *p2m);
     
     void               (*write_p2m_entry)(struct p2m_domain *p2m,
                                           unsigned long gfn, l1_pgentry_t *p,
@@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain
 p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
                            p2m_type_t ot, p2m_type_t nt);
 
+/* Report a change affecting memory types. */
+void p2m_memory_type_changed(struct domain *d);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);


Attachment: EPT-sync-mem-types.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.