
Re: [Xen-devel] [PATCH v2 3/4] x86/HVM: implement memory read caching



> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@xxxxxxxx]
> Sent: 11 September 2018 14:15
> To: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>
> Cc: Andrew Cooper <Andrew.Cooper3@xxxxxxxxxx>; Paul Durrant
> <Paul.Durrant@xxxxxxxxxx>; Wei Liu <wei.liu2@xxxxxxxxxx>; George Dunlap
> <George.Dunlap@xxxxxxxxxx>
> Subject: [PATCH v2 3/4] x86/HVM: implement memory read caching
> 
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses. In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass translated a linear address successfully, any subsequent
> pass needs to do so too, yielding the exact same translation.
> 
> Introduce a cache (used just for guest page table accesses for now) to
> make sure the above-described assumption holds. This is a very simplistic
> implementation for now: Only exact matches are satisfied (no overlaps or
> partial reads or anything).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Acked-by: Tim Deegan <tim@xxxxxxx>
> ---
> v2: Re-base.
> 
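For readers following the idea: the cache is keyed on (gpa, level, size)
and is used read-through during the guest page walk. A minimal sketch of
the intended pattern, using the two helpers introduced below
(guest_walk_read() is a hypothetical stand-in for the real PTE read, which
this patch actually reaches via paging_gla_to_gfn()):

    unsigned long pte;

    /* On re-execution, replay exactly what the first pass read... */
    if ( !hvmemul_read_cache(cache, gpa, level, &pte, sizeof(pte)) )
    {
        /* ...and only do a real access on the first pass. */
        pte = guest_walk_read(gpa);                   /* hypothetical */
        hvmemul_write_cache(cache, gpa, level, &pte, sizeof(pte));
    }
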
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -27,6 +27,18 @@
>  #include <asm/hvm/svm/svm.h>
>  #include <asm/vm_event.h>
> 
> +struct hvmemul_cache
> +{
> +    unsigned int max_ents;
> +    unsigned int num_ents;
> +    struct {
> +        paddr_t gpa:PADDR_BITS;
> +        unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned long data;
> +    } ents[];
> +};
> +
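(For reference: with the usual x86-64 values PADDR_BITS == 52 and
BITS_PER_LONG == 64, size and level get (64 - 52) / 2 == 6 bits each, so
gpa/size/level should pack into a single 8-byte unit and each entry ends
up at 16 bytes including the data word. 6 bits comfortably hold sizes up
to sizeof(unsigned long) and any paging level.)
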
>  static void hvmtrace_io_assist(const ioreq_t *p)
>  {
>      unsigned int size, event;
> @@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t
>   */
>  static void *hvmemul_map_linear_addr(
>      unsigned long linear, unsigned int bytes, uint32_t pfec,
> -    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +    struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
>  {
>      struct vcpu *curr = current;
>      void *err, *mapping;
> @@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr(
>          ASSERT(mfn_x(*mfn) == 0);
> 
>          res = hvm_translate_get_page(curr, addr, true, pfec,
> -                                     &pfinfo, &page, NULL, &p2mt);
> +                                     &pfinfo, &page, NULL, &p2mt, cache);
> 
>          switch ( res )
>          {
> @@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys(
>      gfn_t gfn, ngfn;
>      unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
>      int reverse;
> +    struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
> +                                  ? NULL : curr->arch.hvm.data_cache;
> 
>      /*
>       * Clip repetitions to a sensible maximum. This avoids extensive looping in
> @@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys(
>              return rc;
>          gfn = gaddr_to_gfn(gaddr);
>      }
> -    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
> +    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
>                       INVALID_GFN) )
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
> @@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys(
>      {
>          /* Get the next PFN in the range. */
>          addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
> -        ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
> +        ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
> 
>          /* Is it contiguous with the preceding PFNs? If not then we're done. */
>          if ( gfn_eq(ngfn, INVALID_GFN) ||
> @@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add
>                         uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
>  {
>      pagefault_info_t pfinfo;
> -    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
> +    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
> +                                        (pfec & PFEC_insn_fetch
> +                                         ? NULL
> +                                         : current->arch.hvm.data_cache));
> 
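Worth noting for other readers: insn fetches deliberately bypass this cache
here and elsewhere in the patch, since the emulator already latches the
instruction bytes in hvmemul_ctxt->insn_buf and so doesn't need
re-execution protection for them.
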
>      switch ( rc )
>      {
> @@ -1270,7 +1287,8 @@ static int hvmemul_write(
> 
>      if ( !known_gla(addr, bytes, pfec) )
>      {
> -        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
>          if ( IS_ERR(mapping) )
>               return ~PTR_ERR(mapping);
>      }
> @@ -1312,7 +1330,8 @@ static int hvmemul_rmw(
> 
>      if ( !known_gla(addr, bytes, pfec) )
>      {
> -        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
>          if ( IS_ERR(mapping) )
>              return ~PTR_ERR(mapping);
>      }
> @@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e
>      {
>          vio->mmio_cache_count = 0;
>          vio->mmio_insn_bytes = 0;
> +        curr->arch.hvm.data_cache->num_ents = 0;
>      }
>      else
>      {
> @@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn(
>                                          &addr) &&
>               hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
>                                          sizeof(hvmemul_ctxt->insn_buf),
> -                                        pfec | PFEC_insn_fetch,
> +                                        pfec | PFEC_insn_fetch, NULL,
>                                          NULL) == HVMTRANS_okay) ?
>              sizeof(hvmemul_ctxt->insn_buf) : 0;
>      }
> @@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char
>             hvmemul_ctxt->insn_buf);
>  }
> 
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
> +{
> +    struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache,
> +                                                         ents[nents]));
> +
> +    if ( cache )
> +    {
> +        cache->num_ents = 0;
> +        cache->max_ents = nents;
> +    }
> +
> +    return cache;
> +}
> +
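The flexible-array sizing above works out exactly, by the way:
offsetof(struct hvmemul_cache, ents[nents]) equals
offsetof(struct hvmemul_cache, ents) + nents * sizeof(*cache->ents), i.e.
the header plus nents entries, with nothing over-allocated.
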
>  bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size)
>  {
> +    unsigned int i;
> +

Here you could return false if cache is NULL...
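E.g. (just a sketch):

    if ( !cache )
        return false;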

> +    ASSERT(size <= sizeof(cache->ents->data));
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(buffer, &cache->ents[i].data, size);
> +            return true;
> +        }
> +
>      return false;
>  }
> 
> @@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_
>                           unsigned int level, const void *buffer,
>                           unsigned int size)
>  {
> +    unsigned int i;
> +

...and here just bail out, thus making both functions safe to call with a
NULL cache.
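E.g.:

    if ( !cache )
        return;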

> +    if ( size > sizeof(cache->ents->data) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(&cache->ents[i].data, buffer, size);
> +            return;
> +        }
> +
> +    if ( unlikely(i >= cache->max_ents) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    cache->ents[i].level = level;
> +    cache->ents[i].gpa   = gpa;
> +    cache->ents[i].size  = size;
> +
> +    memcpy(&cache->ents[i].data, buffer, size);
> +
> +    cache->num_ents = i + 1;
>  }
> 
>  /*
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1521,6 +1521,17 @@ int hvm_vcpu_initialise(struct vcpu *v)
> 
>      v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
> 
> +    /*
> +     * Leaving aside the insn fetch, for which we don't use this cache, no
> +     * insn can access more than 8 independent linear addresses (AVX2
> +     * gathers being the worst). Each such linear range can span a page
> +     * boundary, i.e. require two page walks.
> +     */
> +    v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * 2);
> +    rc = -ENOMEM;
> +    if ( !v->arch.hvm.data_cache )
> +        goto fail4;
> +
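For CONFIG_PAGING_LEVELS == 4 that works out to 4 * 8 * 2 == 64 entries,
i.e. roughly 1KiB per vCPU at 16 bytes per entry - a modest price for
making re-execution deterministic.
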
>      rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
>      if ( rc != 0 )
>          goto fail4;
> @@ -1550,6 +1561,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
>   fail5:
>      free_compat_arg_xlat(v);
>   fail4:
> +    hvmemul_cache_destroy(v->arch.hvm.data_cache);
>      hvm_funcs.vcpu_destroy(v);
>   fail3:
>      vlapic_destroy(v);
> @@ -1572,6 +1584,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
> 
>      free_compat_arg_xlat(v);
> 
> +    hvmemul_cache_destroy(v->arch.hvm.data_cache);
> +
>      tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet);
>      hvm_funcs.vcpu_destroy(v);
> 
> @@ -2946,7 +2960,7 @@ void hvm_task_switch(
>      }
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      if ( rc != HVMTRANS_okay )
> @@ -2993,7 +3007,7 @@ void hvm_task_switch(
>          goto out;
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      /*
> @@ -3104,7 +3118,7 @@ void hvm_task_switch(
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p)
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
>  {
>      struct page_info *page;
>      p2m_type_t p2mt;
> @@ -3112,7 +3126,7 @@ enum hvm_translation_result hvm_translat
> 
>      if ( linear )
>      {
> -        gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
> +        gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
> 
>          if ( gfn_eq(gfn, INVALID_GFN) )
>          {
> @@ -3184,7 +3198,7 @@ enum hvm_translation_result hvm_translat
>  #define HVMCOPY_linear     (1u<<2)
>  static enum hvm_translation_result __hvm_copy(
>      void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
> -    uint32_t pfec, pagefault_info_t *pfinfo)
> +    uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      gfn_t gfn;
>      struct page_info *page;
> @@ -3217,8 +3231,8 @@ static enum hvm_translation_result __hvm
> 
>          count = min_t(int, PAGE_SIZE - gpa, todo);
> 
> -        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
> -                                     pfec, pfinfo, &page, &gfn, &p2mt);
> +        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
> +                                     pfinfo, &page, &gfn, &p2mt, cache);
>          if ( res != HVMTRANS_okay )
>              return res;
> 
> @@ -3265,14 +3279,14 @@ enum hvm_translation_result hvm_copy_to_
>      paddr_t paddr, void *buf, int size, struct vcpu *v)
>  {
>      return __hvm_copy(buf, paddr, size, v,
> -                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_phys(
>      void *buf, paddr_t paddr, int size)
>  {
>      return __hvm_copy(buf, paddr, size, current,
> -                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_to_guest_linear(
> @@ -3281,16 +3295,17 @@ enum hvm_translation_result hvm_copy_to_
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_to_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_write_access | pfec, pfinfo);
> +                      PFEC_page_present | PFEC_write_access | pfec,
> +                      pfinfo, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo)
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | pfec, pfinfo);
> +                      PFEC_page_present | pfec, pfinfo, cache);
>  }
> 
>  unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
> @@ -3331,7 +3346,8 @@ unsigned long copy_from_user_hvm(void *t
>          return 0;
>      }
> 
> -    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
> +    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
> +                                    0, NULL, NULL);
>      return rc ? len : 0; /* fake a copy_from_user() return code */
>  }
> 
> @@ -3747,7 +3763,7 @@ void hvm_ud_intercept(struct cpu_user_re
>                                          sizeof(sig), hvm_access_insn_fetch,
>                                          cs, &addr) &&
>               (hvm_copy_from_guest_linear(sig, addr, sizeof(sig),
> -                                         walk, NULL) == HVMTRANS_okay) &&
> +                                         walk, NULL, NULL) == HVMTRANS_okay) &&
>               (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) )
>          {
>              regs->rip += sizeof(sig);
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
>          goto raise_exception;
> 
>      rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
> -                                    PFEC_implicit, &pfinfo);
> +                                    PFEC_implicit, &pfinfo, NULL);
>      if ( rc )
>      {
>          if ( rc == HVMTRANS_bad_linear_to_gfn )
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
>          {
>              pagefault_info_t pfinfo;
>              int rc = hvm_copy_from_guest_linear(poperandS, base, size,
> -                                                0, &pfinfo);
> +                                                0, &pfinfo, NULL);
> 
>              if ( rc == HVMTRANS_bad_linear_to_gfn )
>                  hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini
>              hvm_access_insn_fetch, sh_ctxt, &addr) &&
>           !hvm_copy_from_guest_linear(
>               sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> -             PFEC_insn_fetch, NULL))
> +             PFEC_insn_fetch, NULL, NULL))
>          ? sizeof(sh_ctxt->insn_buf) : 0;
> 
>      return &hvm_shadow_emulator_ops;
> @@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh
>                  hvm_access_insn_fetch, sh_ctxt, &addr) &&
>               !hvm_copy_from_guest_linear(
>                   sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> -                 PFEC_insn_fetch, NULL))
> +                 PFEC_insn_fetch, NULL, NULL))
>              ? sizeof(sh_ctxt->insn_buf) : 0;
>          sh_ctxt->insn_buf_eip = regs->rip;
>      }
> --- a/xen/arch/x86/mm/shadow/hvm.c
> +++ b/xen/arch/x86/mm/shadow/hvm.c
> @@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg,
>      rc = hvm_copy_from_guest_linear(p_data, addr, bytes,
>                                      (access_type == hvm_access_insn_fetch
>                                       ? PFEC_insn_fetch : 0),
> -                                    &pfinfo);
> +                                    &pfinfo, NULL);
> 
>      switch ( rc )
>      {
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
>                            void *buffer);
> 
>  struct hvmemul_cache;
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
> +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
> +{
> +    xfree(cache);
> +}
>  bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size);
>  void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
>      pagefault_info_t *pfinfo);
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo);
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
> 
>  /*
>   * Get a reference on the page under an HVM physical or linear address.  If
> @@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p);
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
> 
>  #define HVM_HCALL_completed  0 /* hypercall completed - no further action */
>  #define HVM_HCALL_preempted  1 /* hypercall preempted - re-execute VMCALL */
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -53,8 +53,6 @@ struct hvm_mmio_cache {
>      uint8_t buffer[32];
>  };
> 
> -struct hvmemul_cache;
> -
>  struct hvm_vcpu_io {
>      /* I/O request in flight to device model. */
>      enum hvm_io_completion io_completion;
> @@ -200,6 +198,7 @@ struct hvm_vcpu {
>      u8                  cache_mode;
> 
>      struct hvm_vcpu_io  hvm_io;
> +    struct hvmemul_cache *data_cache;
> 
>      /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
>      struct x86_event     inject_event;
> 
> 

