
[Xen-devel] [PATCH v2 1/6] x86/EPT: don't walk entire page tables when globally changing types



Instead leverage the EPT_MISCONFIG VM exit by marking just the top
level entries as needing recalculation of their type, propagating the
recalculation state down as necessary such that the actual
recalculation gets done upon access.

For this to work, we have to
- restrict the types between which conversions can be done (right now
  only the two types involved in log dirty tracking need to be taken
  care of)
- remember the ranges that log dirty tracking was requested for as well
  as whether global log dirty tracking is in effect

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
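
For reference, the tri-state range classification the patch relies on can be
illustrated with a stand-alone sketch (not part of the patch; the plain array
and the is_logdirty_range() helper below merely stand in for Xen's rangeset
and the new p2m_is_logdirty_range(), and the GFN values are made up):

#include <stdio.h>
#include <stdbool.h>

struct range { unsigned long s, e; };              /* inclusive GFN bounds */
static const struct range logdirty[] = { { 0xa0, 0xbf } };
static const bool global_logdirty = false;

/* 1: fully log-dirty, 0: fully r/w, -1: mixed (super page must be split) */
static int is_logdirty_range(unsigned long s, unsigned long e)
{
    unsigned int i;

    if ( global_logdirty )
        return 1;
    for ( i = 0; i < sizeof(logdirty) / sizeof(logdirty[0]); i++ )
    {
        if ( s >= logdirty[i].s && e <= logdirty[i].e )
            return 1;                              /* fully contained */
        if ( s <= logdirty[i].e && e >= logdirty[i].s )
            return -1;                             /* partial overlap */
    }
    return 0;
}

int main(void)
{
    /* 2M super page covering GFNs 0x000-0x1ff: mixed, forces a split. */
    printf("%d\n", is_logdirty_range(0x000, 0x1ff));
    /* 4k page inside the tracked range: recalculates to p2m_ram_logdirty. */
    printf("%d\n", is_logdirty_range(0xa5, 0xa5));
    /* 4k page outside any range: recalculates back to p2m_ram_rw. */
    printf("%d\n", is_logdirty_range(0x300, 0x300));
    return 0;
}

This mirrors how resolve_misconfig() and ept_get_entry() in the diff below
decide whether a marked entry becomes p2m_ram_logdirty, p2m_ram_rw, or (for
super pages straddling a range boundary) needs splitting before its type can
be settled.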

--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *
         if ( begin_pfn != dirty_vram->begin_pfn ||
              begin_pfn + nr != dirty_vram->end_pfn )
         {
+            unsigned long ostart = dirty_vram->begin_pfn;
+            unsigned long oend = dirty_vram->end_pfn;
+
             dirty_vram->begin_pfn = begin_pfn;
             dirty_vram->end_pfn = begin_pfn + nr;
 
             paging_unlock(d);
 
+            if ( oend > ostart )
+                p2m_change_type_range(d, ostart, oend,
+                                      p2m_ram_logdirty, p2m_ram_rw);
+
             /* set l1e entries of range within P2M table to be read-only. */
             p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
                                   p2m_ram_rw, p2m_ram_logdirty);
@@ -150,11 +157,16 @@ int hap_track_dirty_vram(struct domain *
              * If zero pages specified while tracking dirty vram
              * then stop tracking
              */
+            begin_pfn = dirty_vram->begin_pfn;
+            nr = dirty_vram->end_pfn - dirty_vram->begin_pfn;
             xfree(dirty_vram);
             d->arch.hvm_domain.dirty_vram = NULL;
         }
 
         paging_unlock(d);
+        if ( nr )
+            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
+                                  p2m_ram_logdirty, p2m_ram_rw);
     }
 out:
     if ( dirty_bitmap )
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -116,8 +116,14 @@ static int p2m_init_hostp2m(struct domai
 
     if ( p2m )
     {
-        d->arch.p2m = p2m;
-        return 0;
+        p2m->logdirty_ranges = rangeset_new(d, "log-dirty",
+                                            RANGESETF_prettyprint_hex);
+        if ( p2m->logdirty_ranges )
+        {
+            d->arch.p2m = p2m;
+            return 0;
+        }
+        p2m_free_one(p2m);
     }
     return -ENOMEM;
 }
@@ -129,6 +135,7 @@ static void p2m_teardown_hostp2m(struct 
 
     if ( p2m )
     {
+        rangeset_destroy(p2m->logdirty_ranges);
         p2m_free_one(p2m);
         d->arch.p2m = NULL;
     }
@@ -191,12 +198,25 @@ int p2m_init(struct domain *d)
     return rc;
 }
 
+int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+                          unsigned long end)
+{
+    ASSERT(!p2m_is_nestedp2m(p2m));
+    if ( p2m->global_logdirty ||
+         rangeset_contains_range(p2m->logdirty_ranges, start, end) )
+        return 1;
+    if ( rangeset_overlaps_range(p2m->logdirty_ranges, start, end) )
+        return -1;
+    return 0;
+}
+
 void p2m_change_entry_type_global(struct domain *d,
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     p2m_lock(p2m);
     p2m->change_entry_type_global(p2m, ot, nt);
+    p2m->global_logdirty = (nt == p2m_ram_logdirty);
     p2m_unlock(p2m);
 }
 
@@ -713,6 +733,7 @@ void p2m_change_type_range(struct domain
     unsigned long gfn;
     mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc = 0;
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
 
@@ -726,11 +747,22 @@ void p2m_change_type_range(struct domain
         mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
         while ( order > PAGE_ORDER_4K )
         {
-            if ( pt != ot )
-                break;
-            if ( !(gfn & ((1UL << order) - 1)) &&
-                 end > (gfn | ((1UL << order) - 1)) )
-                break;
+            unsigned long mask = ~0UL << order;
+
+            /*
+             * Log-dirty ranges starting/ending in the middle of a super page
+             * (with a page split still pending) can't have a consistent type
+             * reported for the full range and hence need the split to be
+             * enforced here.
+             */
+            if ( !p2m_is_changeable(pt) ||
+                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+            {
+                if ( pt != ot )
+                    break;
+                if ( !(gfn & ~mask) && end > (gfn | ~mask) )
+                    break;
+            }
             if ( order == PAGE_ORDER_1G )
                 order = PAGE_ORDER_2M;
             else
@@ -744,6 +776,26 @@ void p2m_change_type_range(struct domain
             break;
     }
 
+    switch ( nt )
+    {
+    case p2m_ram_rw:
+        if ( ot == p2m_ram_logdirty )
+            rc = rangeset_remove_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    case p2m_ram_logdirty:
+        if ( ot == p2m_ram_rw )
+            rc = rangeset_add_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    default:
+        break;
+    }
+    if ( rc )
+    {
+        printk(XENLOG_G_ERR "Error %d manipulating Dom%d's log-dirty ranges\n",
+               rc, d->domain_id);
+        domain_crash(d);
+    }
+
     p2m->defer_nested_flush = 0;
     if ( nestedhvm_enabled(d) )
         p2m_flush_nestedp2m(d);
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -187,7 +187,6 @@ static int ept_split_super_page(struct p
         epte->mfn += i * trunk;
         epte->snp = (iommu_enabled && iommu_snoop);
         ASSERT(!epte->rsvd1);
-        ASSERT(!epte->avail1);
         ASSERT(!epte->avail3);
 
         ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
@@ -270,7 +269,12 @@ static int ept_next_level(struct p2m_dom
     return GUEST_TABLE_NORMAL_PAGE;
 }
 
-static bool_t ept_invalidate_emt(mfn_t mfn)
+/*
+ * Invalidate (via setting the EMT field to an invalid value) all valid
+ * present entries in the given page table, optionally marking the entries
+ * also for their subtrees needing P2M type re-calculation.
+ */
+static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc)
 {
     ept_entry_t *epte = map_domain_page(mfn_x(mfn));
     unsigned int i;
@@ -281,10 +285,12 @@ static bool_t ept_invalidate_emt(mfn_t m
         ept_entry_t e = atomic_read_ept_entry(&epte[i]);
 
         if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
-             e.emt == MTRR_NUM_TYPES )
+             (e.emt == MTRR_NUM_TYPES && (e.recalc || !recalc)) )
             continue;
 
         e.emt = MTRR_NUM_TYPES;
+        if ( recalc )
+            e.recalc = 1;
         atomic_write_ept_entry(&epte[i], e);
         changed = 1;
     }
@@ -294,23 +300,25 @@ static bool_t ept_invalidate_emt(mfn_t m
     return changed;
 }
 
-bool_t ept_handle_misconfig(uint64_t gpa)
+/*
+ * Resolve deliberately mis-configured (EMT field set to an invalid value)
+ * entries in the page table hierarchy for the given GFN:
+ * - calculate the correct value for the EMT field
+ * - if marked so, re-calculate the P2M type
+ * - propagate EMT and re-calculation flag down to the next page table level
+ *   for entries not involved in the translation of the given GFN
+ */
+static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
 {
-    struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
     struct ept_data *ept = &p2m->ept;
     unsigned int level = ept_get_wl(ept);
-    unsigned long gfn = PFN_DOWN(gpa);
     unsigned long mfn = ept_get_asr(ept);
     ept_entry_t *epte;
-    int okay;
+    int rc = 0;
 
     if ( !mfn )
         return 0;
 
-    p2m_lock(p2m);
-
-    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
     for ( ; ; --level )
     {
         ept_entry_t e;
@@ -340,6 +348,13 @@ bool_t ept_handle_misconfig(uint64_t gpa
                                                _mfn(e.mfn), 0, &ipat,
                                                e.sa_p2mt == p2m_mmio_direct);
                     e.ipat = ipat;
+                    if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                    {
+                         e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
+                                     ? p2m_ram_logdirty : p2m_ram_rw;
+                         ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+                    }
+                    e.recalc = 0;
                     atomic_write_ept_entry(&epte[i], e);
                 }
             }
@@ -348,6 +363,25 @@ bool_t ept_handle_misconfig(uint64_t gpa
                 int emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
                                              level * EPT_TABLE_ORDER, &ipat,
                                              e.sa_p2mt == p2m_mmio_direct);
+
+                if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                {
+                     unsigned long mask = ~0UL << (level * EPT_TABLE_ORDER);
+
+                     switch ( p2m_is_logdirty_range(p2m, gfn & mask,
+                                                    gfn | ~mask) )
+                     {
+                     case 0:
+                          e.sa_p2mt = p2m_ram_rw;
+                          break;
+                     case 1:
+                          e.sa_p2mt = p2m_ram_logdirty;
+                          break;
+                     default: /* Force split. */
+                          emt = -1;
+                          break;
+                     }
+                }
                 if ( unlikely(emt < 0) )
                 {
                     if ( ept_split_super_page(p2m, &e, level, level - 1) )
@@ -357,27 +391,31 @@ bool_t ept_handle_misconfig(uint64_t gpa
                         continue;
                     }
                     ept_free_entry(p2m, &e, level);
-                    okay = 0;
+                    rc = -ENOMEM;
                     break;
                 }
                 e.emt = emt;
                 e.ipat = ipat;
+                if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                    ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+                e.recalc = 0;
                 atomic_write_ept_entry(&epte[i], e);
             }
 
-            okay = 1;
+            rc = 1;
             break;
         }
 
         if ( e.emt == MTRR_NUM_TYPES )
         {
             ASSERT(is_epte_present(&e));
-            ept_invalidate_emt(_mfn(e.mfn));
+            ept_invalidate_emt(_mfn(e.mfn), e.recalc);
             smp_wmb();
             e.emt = 0;
+            e.recalc = 0;
             atomic_write_ept_entry(&epte[i], e);
             unmap_domain_page(epte);
-            okay = 1;
+            rc = 1;
         }
         else if ( is_epte_present(&e) && !e.emt )
             unmap_domain_page(epte);
@@ -388,18 +426,34 @@ bool_t ept_handle_misconfig(uint64_t gpa
     }
 
     unmap_domain_page(epte);
-    if ( okay > 0 )
+    if ( rc )
     {
         struct vcpu *v;
 
-        for_each_vcpu ( curr->domain, v )
+        for_each_vcpu ( p2m->domain, v )
             v->arch.hvm_vmx.ept_spurious_misconfig = 1;
     }
+
+    return rc;
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    bool_t spurious;
+    int rc;
+
+    p2m_lock(p2m);
+
+    spurious = curr->arch.hvm_vmx.ept_spurious_misconfig;
+    rc = resolve_misconfig(p2m, PFN_DOWN(gpa));
     curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
     ept_sync_domain(p2m);
+
     p2m_unlock(p2m);
 
-    return !!okay;
+    return rc >= !spurious;
 }
 
 /*
@@ -416,12 +470,11 @@ ept_set_entry(struct p2m_domain *p2m, un
     unsigned long gfn_remainder = gfn;
     int i, target = order / EPT_TABLE_ORDER;
     int rc = 0;
-    int ret = 0;
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
     int vtd_pte_present = 0;
-    int needs_sync = 1;
+    int ret, needs_sync = -1;
     ept_entry_t old_entry = { .epte = 0 };
     ept_entry_t new_entry = { .epte = 0 };
     struct ept_data *ept = &p2m->ept;
@@ -439,12 +492,23 @@ ept_set_entry(struct p2m_domain *p2m, un
          (order % EPT_TABLE_ORDER) )
         return -EINVAL;
 
+    /* Carry out any eventually pending earlier changes first. */
+    ret = resolve_misconfig(p2m, gfn);
+    if ( ret < 0 )
+    {
+        ept_sync_domain(p2m);
+        return ret;
+    }
+    if ( ret > 0 )
+        needs_sync = 1;
+
     ASSERT((target == 2 && hvm_hap_has_1gb()) ||
            (target == 1 && hvm_hap_has_2mb()) ||
            (target == 0));
 
     table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
+    ret = GUEST_TABLE_MAP_FAILED;
     for ( i = ept_get_wl(ept); i > target; i-- )
     {
         ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
@@ -478,7 +542,7 @@ ept_set_entry(struct p2m_domain *p2m, un
         /* We reached the target level. */
 
         /* No need to flush if the old entry wasn't valid */
-        if ( !is_epte_present(ept_entry) )
+        if ( needs_sync < 0 && !is_epte_present(ept_entry) )
             needs_sync = 0;
 
         /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
@@ -596,6 +660,7 @@ static mfn_t ept_get_entry(struct p2m_do
     u32 index;
     int i;
     int ret = 0;
+    bool_t recalc = 0;
     mfn_t mfn = _mfn(INVALID_MFN);
     struct ept_data *ept = &p2m->ept;
 
@@ -611,6 +676,8 @@ static mfn_t ept_get_entry(struct p2m_do
     for ( i = ept_get_wl(ept); i > 0; i-- )
     {
     retry:
+        if ( table[gfn_remainder >> (i * EPT_TABLE_ORDER)].recalc )
+            recalc = 1;
         ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
         if ( !ret )
             goto out;
@@ -657,7 +724,12 @@ static mfn_t ept_get_entry(struct p2m_do
 
     if ( is_epte_valid(ept_entry) )
     {
-        *t = ept_entry->sa_p2mt;
+        if ( (recalc || ept_entry->recalc) &&
+             p2m_is_changeable(ept_entry->sa_p2mt) )
+            *t = p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                      : p2m_ram_rw;
+        else
+            *t = ept_entry->sa_p2mt;
         *a = ept_entry->access;
 
         mfn = _mfn(ept_entry->mfn);
@@ -733,53 +805,18 @@ out:
     return;
 }
 
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
-                                       p2m_type_t ot, p2m_type_t nt)
-{
-    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_epte_valid(epte + i) )
-            continue;
-
-        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
-            ept_change_entry_type_page(_mfn(epte[i].mfn),
-                                       ept_page_level - 1, ot, nt);
-        else
-        {
-            e = atomic_read_ept_entry(&epte[i]);
-            if ( e.sa_p2mt != ot )
-                continue;
-
-            e.sa_p2mt = nt;
-            ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
-        }
-    }
-
-    unmap_domain_page(epte);
-}
-
 static void ept_change_entry_type_global(struct p2m_domain *p2m,
                                          p2m_type_t ot, p2m_type_t nt)
 {
-    struct ept_data *ept = &p2m->ept;
-    if ( ept_get_asr(ept) == 0 )
-        return;
+    unsigned long mfn = ept_get_asr(&p2m->ept);
 
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(p2m_is_mmio(ot) || p2m_is_mmio(nt));
+    if ( !mfn || ot == nt )
+        return;
 
-    ept_change_entry_type_page(_mfn(ept_get_asr(ept)),
-                               ept_get_wl(ept), ot, nt);
+    BUG_ON(!p2m_is_changeable(ot) || !p2m_is_changeable(nt));
 
-    ept_sync_domain(p2m);
+    if ( ept_invalidate_emt(_mfn(mfn), 1) )
+        ept_sync_domain(p2m);
 }
 
 static void ept_memory_type_changed(struct p2m_domain *p2m)
@@ -789,7 +826,7 @@ static void ept_memory_type_changed(stru
     if ( !mfn )
         return;
 
-    if ( ept_invalidate_emt(_mfn(mfn)) )
+    if ( ept_invalidate_emt(_mfn(mfn), 0) )
         ept_sync_domain(p2m);
 }
 
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -38,7 +38,7 @@ typedef union {
         ipat        :   1,  /* bit 6 - Ignore PAT memory type */
         sp          :   1,  /* bit 7 - Is this a superpage? */
         rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
-        avail1      :   1,  /* bit 10 - Software available 1 */
+        recalc      :   1,  /* bit 10 - Software available 1 */
         snp         :   1,  /* bit 11 - VT-d snoop control in shared
                                EPT/VT-d usage */
         mfn         :   40, /* bits 51:12 - Machine physical frame number */
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -139,6 +139,10 @@ typedef unsigned int p2m_query_t;
                       | p2m_to_mask(p2m_grant_map_ro)   \
                       | p2m_to_mask(p2m_ram_shared) )
 
+/* Types that can be subject to bulk transitions. */
+#define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
+                              | p2m_to_mask(p2m_ram_logdirty) )
+
 #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand))
 
 /* Pageable types */
@@ -167,6 +171,7 @@ typedef unsigned int p2m_query_t;
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_changeable(_t) (p2m_to_mask(_t) & P2M_CHANGEABLE_TYPES)
 #define p2m_is_pod(_t) (p2m_to_mask(_t) & P2M_POD_TYPES)
 #define p2m_is_grant(_t) (p2m_to_mask(_t) & P2M_GRANT_TYPES)
 /* Grant types are *not* considered valid, because they can be
@@ -209,6 +214,11 @@ struct p2m_domain {
      * threaded on in LRU order. */
     struct list_head   np2m_list;
 
+    /* Host p2m: Log-dirty ranges registered for the domain. */
+    struct rangeset   *logdirty_ranges;
+
+    /* Host p2m: Global log-dirty mode enabled for the domain. */
+    bool_t             global_logdirty;
 
     /* Host p2m: when this flag is set, don't flush all the nested-p2m 
      * tables on every host-p2m change.  The setter of this flag 
@@ -510,6 +520,9 @@ p2m_type_t p2m_change_type(struct domain
 /* Report a change affecting memory types. */
 void p2m_memory_type_changed(struct domain *d);
 
+int p2m_is_logdirty_range(struct p2m_domain *, unsigned long start,
+                          unsigned long end);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);


Attachment: EPT-replace-cetg.patch
Description: Text document


 

