[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v3 3/4] x86/HVM: implement memory read caching
Emulation requiring device model assistance uses a form of instruction re-execution, assuming that the second (and any further) pass takes exactly the same path. This is a valid assumption as far as use of CPU registers goes (as those can't change without any other instruction executing in between), but is wrong for memory accesses. In particular it has been observed that Windows might page out buffers underneath an instruction currently under emulation (hitting between two passes). If the first pass translated a linear address successfully, any subsequent pass needs to do so too, yielding the exact same translation. Introduce a cache (used by just guest page table accesses for now) to make sure the above described assumption holds. This is a very simplistic implementation for now: Only exact matches are satisfied (no overlaps or partial reads or anything). As to the actual data page in this scenario, there are a couple of aspects to take into consideration: - We must be talking about an insn accessing two locations (two memory ones, one of which is MMIO, or a memory and an I/O one). - If the non I/O / MMIO side is being read, the re-read (if it occurs at all) is having its result discarded, by taking the shortcut through the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note how, among all the re-issue sanity checks there, we avoid comparing the actual data. - If the non I/O / MMIO side is being written, it is the OS's responsibility to avoid actually moving page contents to disk while there might still be a write access in flight - this is no different in behavior from bare hardware. - Read-modify-write accesses are, as always, complicated, and while we deal with them better nowadays than we did in the past, we're still not quite there to guarantee hardware-like behavior in all cases anyway. Nothing is getting worse by the changes made here, afaict. 
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Acked-by: Tim Deegan <tim@xxxxxxx> Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx> --- v3: Add text about the actual data page to the description. v2: Re-base. --- a/xen/arch/x86/hvm/emulate.c +++ b/xen/arch/x86/hvm/emulate.c @@ -27,6 +27,18 @@ #include <asm/hvm/svm/svm.h> #include <asm/vm_event.h> +struct hvmemul_cache +{ + unsigned int max_ents; + unsigned int num_ents; + struct { + paddr_t gpa:PADDR_BITS; + unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2; + unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2; + unsigned long data; + } ents[]; +}; + static void hvmtrace_io_assist(const ioreq_t *p) { unsigned int size, event; @@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t */ static void *hvmemul_map_linear_addr( unsigned long linear, unsigned int bytes, uint32_t pfec, - struct hvm_emulate_ctxt *hvmemul_ctxt) + struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache) { struct vcpu *curr = current; void *err, *mapping; @@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr( ASSERT(mfn_x(*mfn) == 0); res = hvm_translate_get_page(curr, addr, true, pfec, - &pfinfo, &page, NULL, &p2mt); + &pfinfo, &page, NULL, &p2mt, cache); switch ( res ) { @@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys( gfn_t gfn, ngfn; unsigned long done, todo, i, offset = addr & ~PAGE_MASK; int reverse; + struct hvmemul_cache *cache = pfec & PFEC_insn_fetch + ? NULL : curr->arch.hvm.data_cache; /* * Clip repetitions to a sensible maximum. This avoids extensive looping in @@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys( return rc; gfn = gaddr_to_gfn(gaddr); } - else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL), + else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache), INVALID_GFN) ) { if ( pfec & (PFEC_page_paged | PFEC_page_shared) ) @@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys( { /* Get the next PFN in the range. */ addr += reverse ? 
-PAGE_SIZE : PAGE_SIZE; - ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL); + ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache); /* Is it contiguous with the preceding PFNs? If not then we're done. */ if ( gfn_eq(ngfn, INVALID_GFN) || @@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt) { pagefault_info_t pfinfo; - int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo); + int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo, + (pfec & PFEC_insn_fetch + ? NULL + : current->arch.hvm.data_cache)); switch ( rc ) { @@ -1270,7 +1287,8 @@ static int hvmemul_write( if ( !known_gla(addr, bytes, pfec) ) { - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, + current->arch.hvm.data_cache); if ( IS_ERR(mapping) ) return ~PTR_ERR(mapping); } @@ -1312,7 +1330,8 @@ static int hvmemul_rmw( if ( !known_gla(addr, bytes, pfec) ) { - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, + current->arch.hvm.data_cache); if ( IS_ERR(mapping) ) return ~PTR_ERR(mapping); } @@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg( else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 ) pfec |= PFEC_user_mode; - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, + curr->arch.hvm.data_cache); if ( IS_ERR(mapping) ) return ~PTR_ERR(mapping); @@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e { vio->mmio_cache_count = 0; vio->mmio_insn_bytes = 0; + curr->arch.hvm.data_cache->num_ents = 0; } else { @@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn( &addr) && hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr, sizeof(hvmemul_ctxt->insn_buf), - pfec | PFEC_insn_fetch, + pfec | PFEC_insn_fetch, NULL, NULL) == HVMTRANS_okay) ? 
sizeof(hvmemul_ctxt->insn_buf) : 0; } @@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char hvmemul_ctxt->insn_buf); } +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents) +{ + struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache, + ents[nents])); + + if ( cache ) + { + cache->num_ents = 0; + cache->max_ents = nents; + } + + return cache; +} + bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa, unsigned int level, void *buffer, unsigned int size) { + unsigned int i; + + ASSERT(size <= sizeof(cache->ents->data)); + + for ( i = 0; i < cache->num_ents; ++i ) + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa && + cache->ents[i].size == size ) + { + memcpy(buffer, &cache->ents[i].data, size); + return true; + } + return false; } @@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_ unsigned int level, const void *buffer, unsigned int size) { + unsigned int i; + + if ( size > sizeof(cache->ents->data) ) + { + ASSERT_UNREACHABLE(); + return; + } + + for ( i = 0; i < cache->num_ents; ++i ) + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa && + cache->ents[i].size == size ) + { + memcpy(&cache->ents[i].data, buffer, size); + return; + } + + if ( unlikely(i >= cache->max_ents) ) + { + ASSERT_UNREACHABLE(); + return; + } + + cache->ents[i].level = level; + cache->ents[i].gpa = gpa; + cache->ents[i].size = size; + + memcpy(&cache->ents[i].data, buffer, size); + + cache->num_ents = i + 1; } /* --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -1498,6 +1498,17 @@ int hvm_vcpu_initialise(struct vcpu *v) v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET; + /* + * Leaving aside the insn fetch, for which we don't use this cache, no + * insn can access more than 8 independent linear addresses (AVX2 + * gathers being the worst). Each such linear range can span a page + * boundary, i.e. require two page walks. 
+ */ + v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * 2); + rc = -ENOMEM; + if ( !v->arch.hvm.data_cache ) + goto fail4; + rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */ if ( rc != 0 ) goto fail4; @@ -1527,6 +1538,7 @@ int hvm_vcpu_initialise(struct vcpu *v) fail5: free_compat_arg_xlat(v); fail4: + hvmemul_cache_destroy(v->arch.hvm.data_cache); hvm_funcs.vcpu_destroy(v); fail3: vlapic_destroy(v); @@ -1549,6 +1561,8 @@ void hvm_vcpu_destroy(struct vcpu *v) free_compat_arg_xlat(v); + hvmemul_cache_destroy(v->arch.hvm.data_cache); + tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet); hvm_funcs.vcpu_destroy(v); @@ -2923,7 +2937,7 @@ void hvm_task_switch( } rc = hvm_copy_from_guest_linear( - &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo); + &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); if ( rc != HVMTRANS_okay ) @@ -2970,7 +2984,7 @@ void hvm_task_switch( goto out; rc = hvm_copy_from_guest_linear( - &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo); + &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); /* @@ -3081,7 +3095,7 @@ void hvm_task_switch( enum hvm_translation_result hvm_translate_get_page( struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec, pagefault_info_t *pfinfo, struct page_info **page_p, - gfn_t *gfn_p, p2m_type_t *p2mt_p) + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache) { struct page_info *page; p2m_type_t p2mt; @@ -3089,7 +3103,7 @@ enum hvm_translation_result hvm_translat if ( linear ) { - gfn = paging_gla_to_gfn(v, addr, &pfec, NULL); + gfn = paging_gla_to_gfn(v, addr, &pfec, cache); if ( gfn_eq(gfn, INVALID_GFN) ) { @@ -3161,7 +3175,7 @@ enum hvm_translation_result hvm_translat #define HVMCOPY_linear (1u<<2) static enum hvm_translation_result 
__hvm_copy( void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags, - uint32_t pfec, pagefault_info_t *pfinfo) + uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache) { gfn_t gfn; struct page_info *page; @@ -3194,8 +3208,8 @@ static enum hvm_translation_result __hvm count = min_t(int, PAGE_SIZE - gpa, todo); - res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, - pfec, pfinfo, &page, &gfn, &p2mt); + res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec, + pfinfo, &page, &gfn, &p2mt, cache); if ( res != HVMTRANS_okay ) return res; @@ -3242,14 +3256,14 @@ enum hvm_translation_result hvm_copy_to_ paddr_t paddr, void *buf, int size, struct vcpu *v) { return __hvm_copy(buf, paddr, size, v, - HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL); + HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL); } enum hvm_translation_result hvm_copy_from_guest_phys( void *buf, paddr_t paddr, int size) { return __hvm_copy(buf, paddr, size, current, - HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL); + HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL); } enum hvm_translation_result hvm_copy_to_guest_linear( @@ -3258,16 +3272,17 @@ enum hvm_translation_result hvm_copy_to_ { return __hvm_copy(buf, addr, size, current, HVMCOPY_to_guest | HVMCOPY_linear, - PFEC_page_present | PFEC_write_access | pfec, pfinfo); + PFEC_page_present | PFEC_write_access | pfec, + pfinfo, NULL); } enum hvm_translation_result hvm_copy_from_guest_linear( void *buf, unsigned long addr, int size, uint32_t pfec, - pagefault_info_t *pfinfo) + pagefault_info_t *pfinfo, struct hvmemul_cache *cache) { return __hvm_copy(buf, addr, size, current, HVMCOPY_from_guest | HVMCOPY_linear, - PFEC_page_present | pfec, pfinfo); + PFEC_page_present | pfec, pfinfo, cache); } unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len) @@ -3308,7 +3323,8 @@ unsigned long copy_from_user_hvm(void *t return 0; } - rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL); + 
rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, + 0, NULL, NULL); return rc ? len : 0; /* fake a copy_from_user() return code */ } @@ -3724,7 +3740,7 @@ void hvm_ud_intercept(struct cpu_user_re sizeof(sig), hvm_access_insn_fetch, cs, &addr) && (hvm_copy_from_guest_linear(sig, addr, sizeof(sig), - walk, NULL) == HVMTRANS_okay) && + walk, NULL, NULL) == HVMTRANS_okay) && (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) ) { regs->rip += sizeof(sig); --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str goto raise_exception; rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size, - PFEC_implicit, &pfinfo); + PFEC_implicit, &pfinfo, NULL); if ( rc ) { if ( rc == HVMTRANS_bad_linear_to_gfn ) --- a/xen/arch/x86/hvm/vmx/vvmx.c +++ b/xen/arch/x86/hvm/vmx/vvmx.c @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us { pagefault_info_t pfinfo; int rc = hvm_copy_from_guest_linear(poperandS, base, size, - 0, &pfinfo); + 0, &pfinfo, NULL); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini hvm_access_insn_fetch, sh_ctxt, &addr) && !hvm_copy_from_guest_linear( sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), - PFEC_insn_fetch, NULL)) + PFEC_insn_fetch, NULL, NULL)) ? sizeof(sh_ctxt->insn_buf) : 0; return &hvm_shadow_emulator_ops; @@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh hvm_access_insn_fetch, sh_ctxt, &addr) && !hvm_copy_from_guest_linear( sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), - PFEC_insn_fetch, NULL)) + PFEC_insn_fetch, NULL, NULL)) ? 
sizeof(sh_ctxt->insn_buf) : 0; sh_ctxt->insn_buf_eip = regs->rip; } --- a/xen/arch/x86/mm/shadow/hvm.c +++ b/xen/arch/x86/mm/shadow/hvm.c @@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg, rc = hvm_copy_from_guest_linear(p_data, addr, bytes, (access_type == hvm_access_insn_fetch ? PFEC_insn_fetch : 0), - &pfinfo); + &pfinfo, NULL); switch ( rc ) { --- a/xen/include/asm-x86/hvm/emulate.h +++ b/xen/include/asm-x86/hvm/emulate.h @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port, void *buffer); struct hvmemul_cache; +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents); +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache) +{ + xfree(cache); +} bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa, unsigned int level, void *buffer, unsigned int size); void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa, --- a/xen/include/asm-x86/hvm/support.h +++ b/xen/include/asm-x86/hvm/support.h @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_ pagefault_info_t *pfinfo); enum hvm_translation_result hvm_copy_from_guest_linear( void *buf, unsigned long addr, int size, uint32_t pfec, - pagefault_info_t *pfinfo); + pagefault_info_t *pfinfo, struct hvmemul_cache *cache); /* * Get a reference on the page under an HVM physical or linear address. 
If @@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro enum hvm_translation_result hvm_translate_get_page( struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec, pagefault_info_t *pfinfo, struct page_info **page_p, - gfn_t *gfn_p, p2m_type_t *p2mt_p); + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache); #define HVM_HCALL_completed 0 /* hypercall completed - no further action */ #define HVM_HCALL_preempted 1 /* hypercall preempted - re-execute VMCALL */ --- a/xen/include/asm-x86/hvm/vcpu.h +++ b/xen/include/asm-x86/hvm/vcpu.h @@ -53,8 +53,6 @@ struct hvm_mmio_cache { uint8_t buffer[32]; }; -struct hvmemul_cache; - struct hvm_vcpu_io { /* I/O request in flight to device model. */ enum hvm_io_completion io_completion; @@ -200,6 +198,7 @@ struct hvm_vcpu { u8 cache_mode; struct hvm_vcpu_io hvm_io; + struct hvmemul_cache *data_cache; /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */ struct x86_event inject_event; _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |