[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 3/4] x86/HVM: implement memory read caching



Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between), but is wrong for memory accesses. In particular
it has been observed that Windows might page out buffers underneath an
instruction currently under emulation (hitting between two passes). If
the first pass translated a linear address successfully, any subsequent
pass needs to do so too, yielding the exact same translation.

Introduce a cache (used by just guest page table accesses for now) to
make sure the above described assumption holds. This is a very simplistic
implementation for now: Only exact matches are satisfied (no overlaps or
partial reads or anything).

As to the actual data page in this scenario, there are a couple of
aspects to take into consideration:
- We must be talking about an insn accessing two locations (two memory
  ones, one of which is MMIO, or a memory and an I/O one).
- If the non I/O / MMIO side is being read, the re-read (if it occurs at
  all) is having its result discarded, by taking the shortcut through
  the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
  how, among all the re-issue sanity checks there, we avoid comparing
  the actual data.
- If the non I/O / MMIO side is being written, it is the OS's
  responsibility to avoid actually moving page contents to disk while
  there might still be a write access in flight - this is no different
  in behavior from bare hardware.
- Read-modify-write accesses are, as always, complicated, and while we
  deal with them better nowadays than we did in the past, we're still
  not quite there to guarantee hardware like behavior in all cases
  anyway. Nothing is getting worse by the changes made here, afaict.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
Acked-by: Tim Deegan <tim@xxxxxxx>
Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx>
---
v3: Add text about the actual data page to the description.
v2: Re-base.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -27,6 +27,18 @@
 #include <asm/hvm/svm/svm.h>
 #include <asm/vm_event.h>
 
+struct hvmemul_cache
+{
+    unsigned int max_ents;
+    unsigned int num_ents;
+    struct {
+        paddr_t gpa:PADDR_BITS;
+        unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
+        unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
+        unsigned long data;
+    } ents[];
+};
+
 static void hvmtrace_io_assist(const ioreq_t *p)
 {
     unsigned int size, event;
@@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t
  */
 static void *hvmemul_map_linear_addr(
     unsigned long linear, unsigned int bytes, uint32_t pfec,
-    struct hvm_emulate_ctxt *hvmemul_ctxt)
+    struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
 {
     struct vcpu *curr = current;
     void *err, *mapping;
@@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr(
         ASSERT(mfn_x(*mfn) == 0);
 
         res = hvm_translate_get_page(curr, addr, true, pfec,
-                                     &pfinfo, &page, NULL, &p2mt);
+                                     &pfinfo, &page, NULL, &p2mt, cache);
 
         switch ( res )
         {
@@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys(
     gfn_t gfn, ngfn;
     unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
     int reverse;
+    struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
+                                  ? NULL : curr->arch.hvm.data_cache;
 
     /*
      * Clip repetitions to a sensible maximum. This avoids extensive looping in
@@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys(
             return rc;
         gfn = gaddr_to_gfn(gaddr);
     }
-    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
+    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
                      INVALID_GFN) )
     {
         if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
@@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys(
     {
         /* Get the next PFN in the range. */
         addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
-        ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
+        ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
 
         /* Is it contiguous with the preceding PFNs? If not then we're done. */
         if ( gfn_eq(ngfn, INVALID_GFN) ||
@@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add
                        uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     pagefault_info_t pfinfo;
-    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
+    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
+                                        (pfec & PFEC_insn_fetch
+                                         ? NULL
+                                         : current->arch.hvm.data_cache));
 
     switch ( rc )
     {
@@ -1270,7 +1287,8 @@ static int hvmemul_write(
 
     if ( !known_gla(addr, bytes, pfec) )
     {
-        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
+                                          current->arch.hvm.data_cache);
         if ( IS_ERR(mapping) )
              return ~PTR_ERR(mapping);
     }
@@ -1312,7 +1330,8 @@ static int hvmemul_rmw(
 
     if ( !known_gla(addr, bytes, pfec) )
     {
-        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
+                                          current->arch.hvm.data_cache);
         if ( IS_ERR(mapping) )
             return ~PTR_ERR(mapping);
     }
@@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg(
     else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
         pfec |= PFEC_user_mode;
 
-    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
+                                      curr->arch.hvm.data_cache);
     if ( IS_ERR(mapping) )
         return ~PTR_ERR(mapping);
 
@@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e
     {
         vio->mmio_cache_count = 0;
         vio->mmio_insn_bytes = 0;
+        curr->arch.hvm.data_cache->num_ents = 0;
     }
     else
     {
@@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn(
                                         &addr) &&
              hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
                                         sizeof(hvmemul_ctxt->insn_buf),
-                                        pfec | PFEC_insn_fetch,
+                                        pfec | PFEC_insn_fetch, NULL,
                                         NULL) == HVMTRANS_okay) ?
             sizeof(hvmemul_ctxt->insn_buf) : 0;
     }
@@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char
            hvmemul_ctxt->insn_buf);
 }
 
+struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
+{
+    struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache,
+                                                         ents[nents]));
+
+    if ( cache )
+    {
+        cache->num_ents = 0;
+        cache->max_ents = nents;
+    }
+
+    return cache;
+}
+
 bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
                         unsigned int level, void *buffer, unsigned int size)
 {
+    unsigned int i;
+
+    ASSERT(size <= sizeof(cache->ents->data));
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
+             cache->ents[i].size == size )
+        {
+            memcpy(buffer, &cache->ents[i].data, size);
+            return true;
+        }
+
     return false;
 }
 
@@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_
                          unsigned int level, const void *buffer,
                          unsigned int size)
 {
+    unsigned int i;
+
+    if ( size > sizeof(cache->ents->data) )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
+             cache->ents[i].size == size )
+        {
+            memcpy(&cache->ents[i].data, buffer, size);
+            return;
+        }
+
+    if ( unlikely(i >= cache->max_ents) )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
+
+    cache->ents[i].level = level;
+    cache->ents[i].gpa   = gpa;
+    cache->ents[i].size  = size;
+
+    memcpy(&cache->ents[i].data, buffer, size);
+
+    cache->num_ents = i + 1;
 }
 
 /*
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1498,6 +1498,17 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
     v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
 
+    /*
+     * Leaving aside the insn fetch, for which we don't use this cache, no
+     * insn can access more than 8 independent linear addresses (AVX2
+     * gathers being the worst). Each such linear range can span a page
+     * boundary, i.e. require two page walks.
+     */
+    v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * 2);
+    rc = -ENOMEM;
+    if ( !v->arch.hvm.data_cache )
+        goto fail4;
+
     rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
     if ( rc != 0 )
         goto fail4;
@@ -1527,6 +1538,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
  fail5:
     free_compat_arg_xlat(v);
  fail4:
+    hvmemul_cache_destroy(v->arch.hvm.data_cache);
     hvm_funcs.vcpu_destroy(v);
  fail3:
     vlapic_destroy(v);
@@ -1549,6 +1561,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
 
     free_compat_arg_xlat(v);
 
+    hvmemul_cache_destroy(v->arch.hvm.data_cache);
+
     tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet);
     hvm_funcs.vcpu_destroy(v);
 
@@ -2923,7 +2937,7 @@ void hvm_task_switch(
     }
 
     rc = hvm_copy_from_guest_linear(
-        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
+        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
     if ( rc == HVMTRANS_bad_linear_to_gfn )
         hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
     if ( rc != HVMTRANS_okay )
@@ -2970,7 +2984,7 @@ void hvm_task_switch(
         goto out;
 
     rc = hvm_copy_from_guest_linear(
-        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
+        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
     if ( rc == HVMTRANS_bad_linear_to_gfn )
         hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
     /*
@@ -3081,7 +3095,7 @@ void hvm_task_switch(
 enum hvm_translation_result hvm_translate_get_page(
     struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
     pagefault_info_t *pfinfo, struct page_info **page_p,
-    gfn_t *gfn_p, p2m_type_t *p2mt_p)
+    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
 {
     struct page_info *page;
     p2m_type_t p2mt;
@@ -3089,7 +3103,7 @@ enum hvm_translation_result hvm_translat
 
     if ( linear )
     {
-        gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
+        gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
 
         if ( gfn_eq(gfn, INVALID_GFN) )
         {
@@ -3161,7 +3175,7 @@ enum hvm_translation_result hvm_translat
 #define HVMCOPY_linear     (1u<<2)
 static enum hvm_translation_result __hvm_copy(
     void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
-    uint32_t pfec, pagefault_info_t *pfinfo)
+    uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
 {
     gfn_t gfn;
     struct page_info *page;
@@ -3194,8 +3208,8 @@ static enum hvm_translation_result __hvm
 
         count = min_t(int, PAGE_SIZE - gpa, todo);
 
-        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
-                                     pfec, pfinfo, &page, &gfn, &p2mt);
+        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
+                                     pfinfo, &page, &gfn, &p2mt, cache);
         if ( res != HVMTRANS_okay )
             return res;
 
@@ -3242,14 +3256,14 @@ enum hvm_translation_result hvm_copy_to_
     paddr_t paddr, void *buf, int size, struct vcpu *v)
 {
     return __hvm_copy(buf, paddr, size, v,
-                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
+                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
 }
 
 enum hvm_translation_result hvm_copy_from_guest_phys(
     void *buf, paddr_t paddr, int size)
 {
     return __hvm_copy(buf, paddr, size, current,
-                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
+                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
 }
 
 enum hvm_translation_result hvm_copy_to_guest_linear(
@@ -3258,16 +3272,17 @@ enum hvm_translation_result hvm_copy_to_
 {
     return __hvm_copy(buf, addr, size, current,
                       HVMCOPY_to_guest | HVMCOPY_linear,
-                      PFEC_page_present | PFEC_write_access | pfec, pfinfo);
+                      PFEC_page_present | PFEC_write_access | pfec,
+                      pfinfo, NULL);
 }
 
 enum hvm_translation_result hvm_copy_from_guest_linear(
     void *buf, unsigned long addr, int size, uint32_t pfec,
-    pagefault_info_t *pfinfo)
+    pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
 {
     return __hvm_copy(buf, addr, size, current,
                       HVMCOPY_from_guest | HVMCOPY_linear,
-                      PFEC_page_present | pfec, pfinfo);
+                      PFEC_page_present | pfec, pfinfo, cache);
 }
 
 unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
@@ -3308,7 +3323,8 @@ unsigned long copy_from_user_hvm(void *t
         return 0;
     }
 
-    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
+    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
+                                    0, NULL, NULL);
     return rc ? len : 0; /* fake a copy_from_user() return code */
 }
 
@@ -3724,7 +3740,7 @@ void hvm_ud_intercept(struct cpu_user_re
                                         sizeof(sig), hvm_access_insn_fetch,
                                         cs, &addr) &&
              (hvm_copy_from_guest_linear(sig, addr, sizeof(sig),
-                                         walk, NULL) == HVMTRANS_okay) &&
+                                         walk, NULL, NULL) == HVMTRANS_okay) &&
              (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) )
         {
             regs->rip += sizeof(sig);
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
         goto raise_exception;
 
     rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
-                                    PFEC_implicit, &pfinfo);
+                                    PFEC_implicit, &pfinfo, NULL);
     if ( rc )
     {
         if ( rc == HVMTRANS_bad_linear_to_gfn )
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
         {
             pagefault_info_t pfinfo;
             int rc = hvm_copy_from_guest_linear(poperandS, base, size,
-                                                0, &pfinfo);
+                                                0, &pfinfo, NULL);
 
             if ( rc == HVMTRANS_bad_linear_to_gfn )
                 hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini
             hvm_access_insn_fetch, sh_ctxt, &addr) &&
          !hvm_copy_from_guest_linear(
              sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
-             PFEC_insn_fetch, NULL))
+             PFEC_insn_fetch, NULL, NULL))
         ? sizeof(sh_ctxt->insn_buf) : 0;
 
     return &hvm_shadow_emulator_ops;
@@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh
                 hvm_access_insn_fetch, sh_ctxt, &addr) &&
              !hvm_copy_from_guest_linear(
                  sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
-                 PFEC_insn_fetch, NULL))
+                 PFEC_insn_fetch, NULL, NULL))
             ? sizeof(sh_ctxt->insn_buf) : 0;
         sh_ctxt->insn_buf_eip = regs->rip;
     }
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg,
     rc = hvm_copy_from_guest_linear(p_data, addr, bytes,
                                     (access_type == hvm_access_insn_fetch
                                      ? PFEC_insn_fetch : 0),
-                                    &pfinfo);
+                                    &pfinfo, NULL);
 
     switch ( rc )
     {
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
                           void *buffer);
 
 struct hvmemul_cache;
+struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
+static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
+{
+    xfree(cache);
+}
 bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
                         unsigned int level, void *buffer, unsigned int size);
 void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
     pagefault_info_t *pfinfo);
 enum hvm_translation_result hvm_copy_from_guest_linear(
     void *buf, unsigned long addr, int size, uint32_t pfec,
-    pagefault_info_t *pfinfo);
+    pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
 
 /*
  * Get a reference on the page under an HVM physical or linear address.  If
@@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro
 enum hvm_translation_result hvm_translate_get_page(
     struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
     pagefault_info_t *pfinfo, struct page_info **page_p,
-    gfn_t *gfn_p, p2m_type_t *p2mt_p);
+    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
 
 #define HVM_HCALL_completed  0 /* hypercall completed - no further action */
 #define HVM_HCALL_preempted  1 /* hypercall preempted - re-execute VMCALL */
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -53,8 +53,6 @@ struct hvm_mmio_cache {
     uint8_t buffer[32];
 };
 
-struct hvmemul_cache;
-
 struct hvm_vcpu_io {
     /* I/O request in flight to device model. */
     enum hvm_io_completion io_completion;
@@ -200,6 +198,7 @@ struct hvm_vcpu {
     u8                  cache_mode;
 
     struct hvm_vcpu_io  hvm_io;
+    struct hvmemul_cache *data_cache;
 
     /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
     struct x86_event     inject_event;




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.