
Re: [Xen-devel] [PATCH 3/6] x86/HVM: implement memory read caching



> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@xxxxxxxx]
> Sent: 19 July 2018 11:49
> To: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>
> Cc: Brian Woods <brian.woods@xxxxxxx>; Suravee Suthikulpanit
> <suravee.suthikulpanit@xxxxxxx>; Andrew Cooper
> <Andrew.Cooper3@xxxxxxxxxx>; Paul Durrant <Paul.Durrant@xxxxxxxxxx>;
> George Dunlap <George.Dunlap@xxxxxxxxxx>; Jun Nakajima
> <jun.nakajima@xxxxxxxxx>; Kevin Tian <kevin.tian@xxxxxxxxx>; Boris
> Ostrovsky <boris.ostrovsky@xxxxxxxxxx>; Tim (Xen.org) <tim@xxxxxxx>
> Subject: [PATCH 3/6] x86/HVM: implement memory read caching
> 
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses. In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass translated a linear address successfully, any subsequent
> pass needs to do so too, yielding the exact same translation.
> 
> Introduce a cache (used just by guest page table accesses for now) to
> make sure the above described assumption holds. This is a very simplistic
> implementation for now: Only exact matches are satisfied (no overlaps or
> partial reads or anything).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
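For context, my reading of the intended use on the page walk side (presumably
wired up elsewhere in this series) is roughly the below. This is only a
sketch: guest_pte_t, read_guest_pte() and pte_gpa are stand-ins, not code
from the patch.

    guest_pte_t pte;

    if ( !cache ||
         !hvmemul_read_cache(cache, pte_gpa, level, &pte, sizeof(pte)) )
    {
        pte = read_guest_pte(pte_gpa);   /* first pass: real memory read */
        if ( cache )
            hvmemul_write_cache(cache, pte_gpa, level, &pte, sizeof(pte));
    }
    /* A re-executed pass asking for the same gpa/level/size hits the cache,
       so it sees the same PTE value and hence the same translation. */
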
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -27,6 +27,18 @@
>  #include <asm/hvm/svm/svm.h>
>  #include <asm/vm_event.h>
> 
> +struct hvmemul_cache
> +{
> +    unsigned int max_ents;
> +    unsigned int num_ents;
> +    struct {
> +        paddr_t gpa:PADDR_BITS;
> +        unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned long data;
> +    } ents[];
> +};
> +
>  static void hvmtrace_io_assist(const ioreq_t *p)
>  {
>      unsigned int size, event;
> @@ -520,7 +532,7 @@ static int hvmemul_do_mmio_addr(paddr_t
>   */
>  static void *hvmemul_map_linear_addr(
>      unsigned long linear, unsigned int bytes, uint32_t pfec,
> -    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +    struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
>  {
>      struct vcpu *curr = current;
>      void *err, *mapping;
> @@ -565,7 +577,7 @@ static void *hvmemul_map_linear_addr(
>          ASSERT(mfn_x(*mfn) == 0);
> 
>          res = hvm_translate_get_page(curr, addr, true, pfec,
> -                                     &pfinfo, &page, NULL, &p2mt);
> +                                     &pfinfo, &page, NULL, &p2mt, cache);
> 
>          switch ( res )
>          {
> @@ -681,6 +693,8 @@ static int hvmemul_linear_to_phys(
>      gfn_t gfn, ngfn;
>      unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
>      int reverse;
> +    struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
> +                                  ? NULL : curr->arch.hvm_vcpu.data_cache;
> 
>      /*
>       * Clip repetitions to a sensible maximum. This avoids extensive looping in
> @@ -710,7 +724,7 @@ static int hvmemul_linear_to_phys(
>              return rc;
>          gfn = gaddr_to_gfn(gaddr);
>      }
> -    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
> +    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
>                       INVALID_GFN) )
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
> @@ -726,7 +740,7 @@ static int hvmemul_linear_to_phys(
>      {
>          /* Get the next PFN in the range. */
>          addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
> -        ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
> +        ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
> 
>          /* Is it contiguous with the preceding PFNs? If not then we're done. */
>          if ( gfn_eq(ngfn, INVALID_GFN) ||
> @@ -1046,6 +1060,8 @@ static int __hvmemul_read(
>          pfec |= PFEC_implicit;
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> +    if ( access_type == hvm_access_insn_fetch )
> +        pfec |= PFEC_insn_fetch;

Since you OR the insn_fetch flag in here...

> 
>      rc = hvmemul_virtual_to_linear(
>          seg, offset, bytes, &reps, access_type, hvmemul_ctxt, &addr);
> @@ -1059,7 +1075,8 @@ static int __hvmemul_read(
> 
>      rc = ((access_type == hvm_access_insn_fetch) ?
>            hvm_fetch_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo) :

...could you not just use hvm_copy_from_guest_linear() here regardless of 
access_type (and just NULL out the cache argument if it is an insn_fetch)?

AFAICT the only reason hvm_fetch_from_guest_linear() exists is to OR the extra 
flag in.
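
I.e. (untested, just to illustrate the suggestion) the call site could then
collapse to something like:

    rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
                                    access_type == hvm_access_insn_fetch
                                    ? NULL
                                    : curr->arch.hvm_vcpu.data_cache);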

  Paul

> -          hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo));
> +          hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
> +                                     curr->arch.hvm_vcpu.data_cache));
> 
>      switch ( rc )
>      {
> @@ -1178,7 +1195,8 @@ static int hvmemul_write(
>           (vio->mmio_gla == (addr & PAGE_MASK)) )
>          return hvmemul_linear_mmio_write(addr, bytes, p_data, pfec, hvmemul_ctxt, 1);
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -1218,7 +1236,8 @@ static int hvmemul_rmw(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      current->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -1375,7 +1394,8 @@ static int hvmemul_cmpxchg(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
> 
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm_vcpu.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
> 
> @@ -2282,6 +2302,7 @@ static int _hvm_emulate_one(struct hvm_e
>      {
>          vio->mmio_cache_count = 0;
>          vio->mmio_insn_bytes = 0;
> +        curr->arch.hvm_vcpu.data_cache->num_ents = 0;
>      }
>      else
>      {
> @@ -2572,9 +2593,35 @@ void hvm_dump_emulation_state(const char
>             hvmemul_ctxt->insn_buf);
>  }
> 
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
> +{
> +    struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache,
> +                                                         ents[nents]));
> +
> +    if ( cache )
> +    {
> +        cache->num_ents = 0;
> +        cache->max_ents = nents;
> +    }
> +
> +    return cache;
> +}
> +
>  bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size)
>  {
> +    unsigned int i;
> +
> +    ASSERT(size <= sizeof(cache->ents->data));
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(buffer, &cache->ents[i].data, size);
> +            return true;
> +        }
> +
>      return false;
>  }
> 
> @@ -2582,6 +2629,35 @@ void hvmemul_write_cache(struct hvmemul_
>                           unsigned int level, const void *buffer,
>                           unsigned int size)
>  {
> +    unsigned int i;
> +
> +    if ( size > sizeof(cache->ents->data) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(&cache->ents[i].data, buffer, size);
> +            return;
> +        }
> +
> +    if ( unlikely(i >= cache->max_ents) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    cache->ents[i].level = level;
> +    cache->ents[i].gpa   = gpa;
> +    cache->ents[i].size  = size;
> +
> +    memcpy(&cache->ents[i].data, buffer, size);
> +
> +    cache->num_ents = i + 1;
>  }
> 
>  /*
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1530,6 +1530,18 @@ int hvm_vcpu_initialise(struct vcpu *v)
> 
>      v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
> 
> +    /*
> +     * Leaving aside the insn fetch, for which we don't use this cache, no
> +     * insn can access more than 8 independent linear addresses (AVX2
> +     * gathers being the worst). Each such linear range can span a page
> +     * boundary, i.e. require two page walks.
> +     */
> +    v->arch.hvm_vcpu.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS *
> +                                                     8 * 2);
> +    rc = -ENOMEM;
> +    if ( !v->arch.hvm_vcpu.data_cache )
> +        goto fail4;
> +
>      rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
>      if ( rc != 0 )
>          goto fail4;
> @@ -1559,6 +1571,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
>   fail5:
>      free_compat_arg_xlat(v);
>   fail4:
> +    hvmemul_cache_destroy(v->arch.hvm_vcpu.data_cache);
>      hvm_funcs.vcpu_destroy(v);
>   fail3:
>      vlapic_destroy(v);
> @@ -1581,6 +1594,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
> 
>      free_compat_arg_xlat(v);
> 
> +    hvmemul_cache_destroy(v->arch.hvm_vcpu.data_cache);
> +
>      tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
>      hvm_funcs.vcpu_destroy(v);
> 
> @@ -2949,7 +2964,7 @@ void hvm_task_switch(
>      }
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      if ( rc != HVMTRANS_okay )
> @@ -2996,7 +3011,7 @@ void hvm_task_switch(
>          goto out;
> 
>      rc = hvm_copy_from_guest_linear(
> -        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      /*
> @@ -3107,7 +3122,7 @@ void hvm_task_switch(
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p)
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
>  {
>      struct page_info *page;
>      p2m_type_t p2mt;
> @@ -3115,7 +3130,7 @@ enum hvm_translation_result hvm_translat
> 
>      if ( linear )
>      {
> -        gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
> +        gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
> 
>          if ( gfn_eq(gfn, INVALID_GFN) )
>          {
> @@ -3187,7 +3202,7 @@ enum hvm_translation_result hvm_translat
>  #define HVMCOPY_linear     (1u<<2)
>  static enum hvm_translation_result __hvm_copy(
>      void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
> -    uint32_t pfec, pagefault_info_t *pfinfo)
> +    uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      gfn_t gfn;
>      struct page_info *page;
> @@ -3220,8 +3235,8 @@ static enum hvm_translation_result __hvm
> 
>          count = min_t(int, PAGE_SIZE - gpa, todo);
> 
> -        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
> -                                     pfec, pfinfo, &page, &gfn, &p2mt);
> +        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
> +                                     pfinfo, &page, &gfn, &p2mt, cache);
>          if ( res != HVMTRANS_okay )
>              return res;
> 
> @@ -3268,14 +3283,14 @@ enum hvm_translation_result hvm_copy_to_
>      paddr_t paddr, void *buf, int size, struct vcpu *v)
>  {
>      return __hvm_copy(buf, paddr, size, v,
> -                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_phys(
>      void *buf, paddr_t paddr, int size)
>  {
>      return __hvm_copy(buf, paddr, size, current,
> -                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_to_guest_linear(
> @@ -3284,16 +3299,17 @@ enum hvm_translation_result hvm_copy_to_
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_to_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_write_access | pfec, pfinfo);
> +                      PFEC_page_present | PFEC_write_access | pfec,
> +                      pfinfo, NULL);
>  }
> 
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo)
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | pfec, pfinfo);
> +                      PFEC_page_present | pfec, pfinfo, cache);
>  }
> 
>  enum hvm_translation_result hvm_fetch_from_guest_linear(
> @@ -3302,7 +3318,7 @@ enum hvm_translation_result hvm_fetch_fr
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo);
> +                      PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo, NULL);
>  }
> 
>  unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
> @@ -3343,7 +3359,8 @@ unsigned long copy_from_user_hvm(void *t
>          return 0;
>      }
> 
> -    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
> +    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
> +                                    0, NULL, NULL);
>      return rc ? len : 0; /* fake a copy_from_user() return code */
>  }
> 
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
>          goto raise_exception;
> 
>      rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
> -                                    PFEC_implicit, &pfinfo);
> +                                    PFEC_implicit, &pfinfo, NULL);
>      if ( rc )
>      {
>          if ( rc == HVMTRANS_bad_linear_to_gfn )
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
>          {
>              pagefault_info_t pfinfo;
>              int rc = hvm_copy_from_guest_linear(poperandS, base, size,
> -                                                0, &pfinfo);
> +                                                0, &pfinfo, NULL);
> 
>              if ( rc == HVMTRANS_bad_linear_to_gfn )
>                  hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -206,7 +206,7 @@ hvm_read(enum x86_segment seg,
>      if ( access_type == hvm_access_insn_fetch )
>          rc = hvm_fetch_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
>      else
> -        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo);
> +        rc = hvm_copy_from_guest_linear(p_data, addr, bytes, 0, &pfinfo, NULL);
> 
>      switch ( rc )
>      {
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
>                            void *buffer);
> 
>  struct hvmemul_cache;
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
> +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
> +{
> +    xfree(cache);
> +}
>  bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int size);
>  void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
>      pagefault_info_t *pfinfo);
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo);
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
>  enum hvm_translation_result hvm_fetch_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
>      pagefault_info_t *pfinfo);
> @@ -113,7 +113,7 @@ enum hvm_translation_result hvm_fetch_fr
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p);
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
> 
>  #define HVM_HCALL_completed  0 /* hypercall completed - no further action */
>  #define HVM_HCALL_preempted  1 /* hypercall preempted - re-execute VMCALL */
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -53,8 +53,6 @@ struct hvm_mmio_cache {
>      uint8_t buffer[32];
>  };
> 
> -struct hvmemul_cache;
> -
>  struct hvm_vcpu_io {
>      /* I/O request in flight to device model. */
>      enum hvm_io_completion io_completion;
> @@ -200,6 +198,7 @@ struct hvm_vcpu {
>      u8                  cache_mode;
> 
>      struct hvm_vcpu_io  hvm_io;
> +    struct hvmemul_cache *data_cache;
> 
>      /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
>      struct x86_event     inject_event;
> 
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

