
Re: [Xen-devel] [PATCH 3/6] x86/HVM: implement memory read caching



> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@xxxxxxxx]
> Sent: 19 July 2018 11:49
> To: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>
> Cc: Brian Woods <brian.woods@xxxxxxx>; Suravee Suthikulpanit
> <suravee.suthikulpanit@xxxxxxx>; Andrew Cooper
> <Andrew.Cooper3@xxxxxxxxxx>; Paul Durrant <Paul.Durrant@xxxxxxxxxx>;
> George Dunlap <George.Dunlap@xxxxxxxxxx>; Jun Nakajima
> <jun.nakajima@xxxxxxxxx>; Kevin Tian <kevin.tian@xxxxxxxxx>; Boris
> Ostrovsky <boris.ostrovsky@xxxxxxxxxx>; Tim (Xen.org) <tim@xxxxxxx>
> Subject: [PATCH 3/6] x86/HVM: implement memory read caching
> 
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses. In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass translated a linear address successfully, any subsequent
> pass needs to do so too, yielding the exact same translation.
> 
> Introduce a cache (used just by guest page table accesses for now) to
> make sure the above described assumption holds. This is a very simplistic
> implementation for now: Only exact matches are satisfied (no overlaps or
> partial reads or anything).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
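For context, my reading of the intended use on the page walk side (presumably
wired up elsewhere in this series) is roughly the below. This is only a
sketch: guest_pte_t, read_guest_pte() and pte_gpa are stand-ins, not code
from the patch.

    guest_pte_t pte;

    if ( !cache ||
         !hvmemul_read_cache(cache, pte_gpa, level, &pte, sizeof(pte)) )
    {
        pte = read_guest_pte(pte_gpa);   /* first pass: real memory read */
        if ( cache )
            hvmemul_write_cache(cache, pte_gpa, level, &pte, sizeof(pte));
    }
    /* A re-executed pass asking for the same gpa/level/size hits the cache,
       so it sees the same PTE value and hence the same translation. */
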
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -27,6 +27,18 @@
>  #include <asm/hvm/svm/svm.h>
>  #include <asm/vm_event.h>
> 
> +struct hvmemul_cache
> +{
> +    unsigned int max_ents;
> +    unsigned int num_ents;
> +    struct {
> +        paddr_t gpa:PADDR_BITS;
> +        unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned long data;
> +    } ents[];
> +};
> +
>  static void hvmtrace_io_assist(const ioreq_t *p)
>  {
>      unsigned int size, event;
> @@ -520,7 +532,7 @@ static int hvmemul_do_mmio_addr(paddr_t
>   */
>  static void *hvmemul_map_linear_addr(
>      unsigned long linear, unsigned int bytes, uint32_t pfec,
> -    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +    struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
>  {
>      struct vcpu *curr = current;
>      void *err, *mapping;
> @@ -565,7 +577,7 @@ static void *hvmemul_map_linear_addr(
>          ASSERT(mfn_x(*mfn) == 0);
> 
>          res = hvm_translate_get_page(curr, addr, true, pfec,
> -                                     &pfinfo, &page, NULL, &p2mt);
> +                                     &pfinfo, &page, NULL, &p2mt, cache);
> 
>          switch ( res )
>          {
> @@ -681,6 +693,8 @@ static int hvmemul_linear_to_phys(
>      gfn_t gfn, ngfn;
>      unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
>      int reverse;
> +    struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
> +                                  ? NULL : curr->arch.hvm_vcpu.data_cache;
> 
>      /*
>       * Clip repetitions to a sensible maximum. This avoids extensive looping in
> @@ -710,7 +724,7 @@ static int hvmemul_linear_to_phys(
>              return rc;
>          gfn = gaddr_to_gfn(gaddr);
>      }
> -    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
> +    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
>                       INVALID_GFN) )
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
> @@ -726,7 +740,7 @@ static int hvmemul_linear_to_phys(
>      {
>          /* Get the next PFN in the range. */
>          addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
> -        ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
> +        ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
> 
>          /* Is it contiguous with the preceding PFNs? If not then we're done. */
>          if ( gfn_eq(ngfn, INVALID_GFN) ||
> @@ -1046,6 +1060,8 @@ static int __hvmemul_read(
>          pfec |= PFEC_implicit;
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> +    if ( access_type == hvm_access_insn_fetch )
> +        pfec |= PFEC_insn_fetch;

Since you OR the insn_fetch flag in here...

> 
>      rc = hvmemul_virtual_to_linear(
>          seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
> @@ -1059,7 +1075,8 @@ static int __hvmemul_read(
> 
>      rc = ((access_type == hvm_access_insn_fetch) ?
>            hvm_fetch_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo) :

...could you not just use hvm_copy_from_guest_linear() here regardless of 
access_type (and just NULL out the cache argument if it is an insn_fetch)?

AFAICT the only reason hvm_fetch_from_guest_linear() exists is to OR the extra 
flag in.
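
I.e. (untested, just to illustrate the suggestion) the call site could then
collapse to something like:

    rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
                                    access_type == hvm_access_insn_fetch
                                    ? NULL
                                    : curr->arch.hvm_vcpu.data_cache);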

  Paul

> -          hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo));
> +          hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
> +                                     curr->arch.hvm_vcpu.data_cache));
> 
>      switch ( rc )
>      {
> @@ -1178,7 +1195,8 @@ static int hvmemul_write(
>           (vio->mmio_gla == (addr & PAGE_MASK)) )
>          return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -1218,7 +1236,8 @@ static int hvmemul_rmw(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      current->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -1375,7 +1394,8 @@ static int hvmemul_cmpxchg(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -2282,6 +2302,7 @@ static int _hvm_emulate_one(struct hvm_e
>      {
>          vio->mmio_cache_count = 0;
>          vio->mmio_insn_bytes = 0;
> +        curr->arch.hvm_vcpu.data_cache->num_ents = 0;
>      }
>      else
>      {
> @@ -2572,9 +2593,35 @@ void hvm_dump_emulation_state(const char
>             hvmemul_ctxt->insn_buf);
>  }
> 
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
> +{
> +    struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache,
> +                                                         ents[nents]));
> +
> +    if ( cache )
> +    {
> +        cache->num_ents = 0;
> +        cache->max_ents = nents;
> +    }
> +
> +    return cache;
> +}
> +
>  bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size)
>  {
> +    unsigned int i;
> +
> +    ASSERT(size <= sizeof(cache->ents->data));
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(buffer, &cache->ents[i].data, size);
> +            return true;
> +        }
> +
>      return false;
>  }
> 
> @@ -2582,6 +2629,35 @@ void hvmemul_write_cache(struct hvmemul_
>                           unsigned int level, const void *buffer,
>                           unsigned int size)
>  {
> +    unsigned int i;
> +
> +    if ( size > sizeof(cache->ents->data) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(&cache->ents[i].data, buffer, size);
> +            return;
> +        }
> +
> +    if ( unlikely(i >= cache->max_ents) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    cache->ents[i].level = level;
> +    cache->ents[i].gpa   = gpa;
> +    cache->ents[i].size  = size;
> +
> +    memcpy(&cache->ents[i].data, buffer, size);
> +
> +    cache->num_ents = i + 1;
>  }
> 
>  /*
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1530,6 +1530,18 @@ int hvm_vcpu_initialise(struct vcpu *v)
> 
>      v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
> 
> +    /*
> +     * Leaving aside the insn fetch, for which we don't use this cache, no
> +     * insn can access more than 8 independent linear addresses (AVX2
> +     * gathers being the worst). Each such linear range can span a page
> +     * boundary, i.e. require two page walks.
> +     */
> +    v->arch.hvm_vcpu.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS *
> +                                                     8 * 2);
> +    rc = -ENOMEM;
> +    if ( !v->arch.hvm_vcpu.data_cache )
> +        goto fail4;
> +
>      rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
>      if ( rc != 0 )
>          goto fail4;
> @@ -1559,6 +1571,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
>   fail5:
>      free_compat_arg_xlat(v);
>   fail4:
> +    hvmemul_cache_destroy(v->arch.hvm_vcpu.data_cache);
>      hvm_funcs.vcpu_destroy(v);
>   fail3:
>      vlapic_destroy(v);
> @@ -1581,6 +1594,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
> 
>      free_compat_arg_xlat(v);
> 
> +    hvmemul_cache_destroy(v->arch.hvm_vcpu.data_cache);
> +
>      tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
>      hvm_funcs.vcpu_destroy(v);
> 
> @@ -2949,7 +2964,7 @@ void hvm_task_switch(
>      }
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      if ( rc != HVMTRANS_okay )
> @@ -2996,7 +3011,7 @@ void hvm_task_switch(
>          goto out;
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      /*
> @@ -3107,7 +3122,7 @@ void hvm_task_switch(
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p)
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
>  {
>      struct page_info *page;
>      p2m_type_t p2mt;
> @@ -3115,7 +3130,7 @@ enum hvm_translation_result hvm_translat
> 
>      if ( linear )
>      {
> -        gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
> +        gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
> 
>          if ( gfn_eq(gfn, INVALID_GFN) )
>          {
> @@ -3187,7 +3202,7 @@ enum hvm_translation_result hvm_translat
>  #define HVMCOPY_linear     (1u<<2)
>  static enum hvm_translation_result __hvm_copy(
>      void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
> -    uint32_t pfec, pagefault_info_t *pfinfo)
> +    uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      gfn_t gfn;
>      struct page_info *page;
> @@ -3220,8 +3235,8 @@ static enum hvm_translation_result __hvm
> 
>          count = min_t(int, PAGE_SIZE - gpa, todo);
> 
> -        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
> -                                     pfec, pfinfo, &page, &gfn, &p2mt);
> +        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
> +                                     pfinfo, &page, &gfn, &p2mt, cache);
>          if ( res != HVMTRANS_okay )
>              return res;
> 
> @@ -3268,14 +3283,14 @@ enum hvm_translation_result hvm_copy_to_
>      paddr_t paddr, void *buf, int size, struct vcpu *v)
>  {
>      return __hvm_copy(buf, paddr, size, v,
> -                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_phys(
>      void *buf, paddr_t paddr, int size)
>  {
>      return __hvm_copy(buf, paddr, size, current,
> -                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_to_guest_linear(
> @@ -3284,16 +3299,17 @@ enum hvm_translation_result hvm_copy_to_
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_to_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_write_access | pfec, pfinfo);
> +                      PFEC_page_present | PFEC_write_access | pfec,
> +                      pfinfo, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo)
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | pfec, pfinfo);
> +                      PFEC_page_present | pfec, pfinfo, cache);
>  }
> 
>  enum hvm_translation_result hvm_fetch_from_guest_linear(
> @@ -3302,7 +3318,7 @@ enum hvm_translation_result hvm_fetch_fr
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo);
> +                      PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo, NULL);
>  }
> 
>  unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
> @@ -3343,7 +3359,8 @@ unsigned long copy_from_user_hvm(void *t
>          return 0;
>      }
> 
> -    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
> +    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
> +                                    0, NULL, NULL);
>      return rc ? len : 0; /* fake a copy_from_user() return code */
>  }
> 
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
>          goto raise_exception;
> 
>      rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
> -                                    PFEC_implicit, &pfinfo);
> +                                    PFEC_implicit, &pfinfo, NULL);
>      if ( rc )
>      {
>          if ( rc == HVMTRANS_bad_linear_to_gfn )
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
>          {
>              pagefault_info_t pfinfo;
>              int rc = hvm_copy_from_guest_linear(poperandS, base, size,
> -                                                0, &pfinfo);
> +                                                0, &pfinfo, NULL);
> 
>              if ( rc == HVMTRANS_bad_linear_to_gfn )
>                  hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -206,7 +206,7 @@ hvm_read(enum x86_segment seg,
>      if ( access_type == hvm_access_insn_fetch )
>          rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
>      else
> -        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
> +        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo, NULL);
> 
>      switch ( rc )
>      {
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
>                            void *buffer);
> 
>  struct hvmemul_cache;
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
> +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
> +{
> +    xfree(cache);
> +}
>  bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size);
>  void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
>      pagefault_info_t *pfinfo);
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo);
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
>  enum hvm_translation_result hvm_fetch_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
>      pagefault_info_t *pfinfo);
> @@ -113,7 +113,7 @@ enum hvm_translation_result hvm_fetch_fr
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p);
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
> 
>  #define HVM_HCALL_completed  0 /* hypercall completed - no further action */
>  #define HVM_HCALL_preempted  1 /* hypercall preempted - re-execute VMCALL */
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -53,8 +53,6 @@ struct hvm_mmio_cache {
>      uint8_t buffer[32];
>  };
> 
> -struct hvmemul_cache;
> -
>  struct hvm_vcpu_io {
>      /* I/O request in flight to device model. */
>      enum hvm_io_completion io_completion;
> @@ -200,6 +198,7 @@ struct hvm_vcpu {
>      u8                  cache_mode;
> 
>      struct hvm_vcpu_io  hvm_io;
> +    struct hvmemul_cache *data_cache;
> 
>      /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
>      struct x86_event     inject_event;
> 
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

