
Re: [Xen-devel] vMCE vs migration



On Tue, 2012-01-31 at 11:27 +0000, George Dunlap wrote:
> On Mon, 2012-01-30 at 13:47 +0000, Jan Beulich wrote:
> > >>> On 26.01.12 at 17:54, George Dunlap <george.dunlap@xxxxxxxxxx> wrote:
> > > On Tue, 2012-01-24 at 11:08 +0000, Jan Beulich wrote:
> > >> >>> On 24.01.12 at 11:29, George Dunlap <George.Dunlap@xxxxxxxxxxxxx> wrote:
> > >> > On Mon, Jan 23, 2012 at 11:08 AM, Jan Beulich <JBeulich@xxxxxxxx> wrote:
> > >> >> x86's vMCE implementation lets a guest know of as many MCE reporting
> > >> >> banks as there are in the host. While a PV guest could be expected
> > >> >> to deal with this number changing (particularly decreasing) during
> > >> >> migration (not currently handled anywhere afaict), for HVM guests
> > >> >> this is certainly wrong.
> > >> >>
> > >> >> It isn't clear to me, however, how to properly handle this. The
> > >> >> easiest approach would appear to be to save and restore the number
> > >> >> of banks the guest was made to believe it can access, making
> > >> >> vmce_{rd,wr}msr() silently tolerate accesses between the host and
> > >> >> guest values.
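
For reference, a minimal sketch of that "silently tolerate" idea, reusing
the nr_vmce_banks field the draft patch below introduces (illustrative
only, not the patch's exact logic):

    /* The guest was told it has v->arch.nr_vmce_banks banks, while this
     * host implements nr_mce_banks.  Accesses to banks the guest knows
     * about but the host lacks read as zero; anything beyond the
     * guest-visible range is left to the normal (faulting) MSR path. */
    if ( bank >= v->arch.nr_vmce_banks )
        return 0;             /* not handled: generic MSR path decides */
    if ( bank >= nr_mce_banks )
    {
        *val = 0;             /* host lacks this bank: reads as zero */
        return 1;             /* handled */
    }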
> > >> >
> > >> > We ran into this in the XS 6.0 release as well.  I think that the
> > >> > ideal thing to do would be to have a parameter that can be set at
> > >> > boot, to say how many vMCE banks a guest has, defaulting to the number
> > >> > of MCE banks on the host.  This parameter would be preserved across
> > >> > migration.  Ideally, a pool-aware toolstack like xapi would then set
> > >> > this value to be the value of the host in the pool with the largest
> > >> > number of banks, allowing a guest to access all the banks on any host
> > >> > to which it migrates.
> > >> >
> > >> > What do you think?
> > >>
> > >> That sounds like the way to go.
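
For illustration, a toolstack-side sketch of that policy (hypothetical
helper, names invented; xapi itself is not written in C, so treat this
purely as pseudo-code for the selection rule):

    /* Pick the largest bank count across all hosts in the pool, so a
     * guest created with this value can use every bank on whichever
     * host it migrates to. */
    static unsigned int pool_vmce_banks(const unsigned int *host_banks,
                                        unsigned int nr_hosts)
    {
        unsigned int i, max = 0;

        for ( i = 0; i < nr_hosts; i++ )
            if ( host_banks[i] > max )
                max = host_banks[i];
        return max;
    }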
> > >
> > > So should we put this on IanC's to-do-be-done list?  Are you going to
> > > put it on your to-do list? :-)
> > 
> > Below/attached is a draft patch (compile-tested only), handling
> > save/restore of the bank count, but not (yet) allowing for a config
> > setting to specify its initial value.
> 
> Looks pretty good at first blush.  Just one question: why is the vMCE
> bank count kept on a per-vcpu basis, rather than on a per-domain basis,
> like the actual banks are?  Is the host MCE stuff per-vcpu?

(Per-pcpu, that is...)

> 
>  -George
> 
> > > +static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
> > +{
> > +    int ret = 1;
> > +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> > +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
> >      struct bank_entry *entry;
> > 
> > -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> > -    if ( bank >= nr_mce_banks )
> > -        return -1;
> > +    *val = 0;
> > 
> >      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
> >      {
> >      case MSR_IA32_MC0_CTL:
> > -        *val = vmce->mci_ctl[bank] &
> > -            (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
> > +        if ( bank < nr_mce_banks )
> > +            *val = vmce->mci_ctl[bank] &
> > +                   (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
> >          mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
> >                     bank, *val);
> >          break;
> > @@ -126,7 +132,7 @@ static int bank_mce_rdmsr(struct domain
> >          switch ( boot_cpu_data.x86_vendor )
> >          {
> >          case X86_VENDOR_INTEL:
> > -            ret = intel_mce_rdmsr(msr, val);
> > +            ret = intel_mce_rdmsr(v, msr, val);
> >              break;
> >          default:
> >              ret = 0;
> > @@ -145,13 +151,13 @@ static int bank_mce_rdmsr(struct domain
> >   */
> >  int vmce_rdmsr(uint32_t msr, uint64_t *val)
> >  {
> > -    struct domain *d = current->domain;
> > -    struct domain_mca_msrs *vmce = dom_vmce(d);
> > +    const struct vcpu *cur = current;
> > +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
> >      int ret = 1;
> > 
> >      *val = 0;
> > 
> > -    spin_lock(&dom_vmce(d)->lock);
> > +    spin_lock(&vmce->lock);
> > 
> >      switch ( msr )
> >      {
> > @@ -173,28 +179,26 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v
> >                     *val);
> >          break;
> >      default:
> > -        ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0;
> > +        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
> >          break;
> >      }
> > 
> > -    spin_unlock(&dom_vmce(d)->lock);
> > +    spin_unlock(&vmce->lock);
> >      return ret;
> >  }
> > 
> > -static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
> > +static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
> >  {
> > -    int bank, ret = 1;
> > -    struct domain_mca_msrs *vmce = dom_vmce(d);
> > +    int ret = 1;
> > +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> > +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
> >      struct bank_entry *entry = NULL;
> > 
> > -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> > -    if ( bank >= nr_mce_banks )
> > -        return -EINVAL;
> > -
> >      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
> >      {
> >      case MSR_IA32_MC0_CTL:
> > -        vmce->mci_ctl[bank] = val;
> > +        if ( bank < nr_mce_banks )
> > +            vmce->mci_ctl[bank] = val;
> >          break;
> >      case MSR_IA32_MC0_STATUS:
> >          /* Give the first entry of the list, it corresponds to current
> > @@ -202,9 +206,9 @@ static int bank_mce_wrmsr(struct domain
> >           * the guest, this node will be deleted.
> >           * Only error bank is written. Non-error banks simply return.
> >           */
> > -        if ( !list_empty(&dom_vmce(d)->impact_header) )
> > +        if ( !list_empty(&vmce->impact_header) )
> >          {
> > -            entry = list_entry(dom_vmce(d)->impact_header.next,
> > +            entry = list_entry(vmce->impact_header.next,
> >                                 struct bank_entry, list);
> >              if ( entry->bank == bank )
> >                  entry->mci_status = val;
> > @@ -228,7 +232,7 @@ static int bank_mce_wrmsr(struct domain
> >          switch ( boot_cpu_data.x86_vendor )
> >          {
> >          case X86_VENDOR_INTEL:
> > -            ret = intel_mce_wrmsr(msr, val);
> > +            ret = intel_mce_wrmsr(v, msr, val);
> >              break;
> >          default:
> >              ret = 0;
> > @@ -247,9 +251,9 @@ static int bank_mce_wrmsr(struct domain
> >   */
> >  int vmce_wrmsr(u32 msr, u64 val)
> >  {
> > -    struct domain *d = current->domain;
> > +    struct vcpu *cur = current;
> >      struct bank_entry *entry = NULL;
> > -    struct domain_mca_msrs *vmce = dom_vmce(d);
> > +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
> >      int ret = 1;
> > 
> >      if ( !g_mcg_cap )
> > @@ -266,7 +270,7 @@ int vmce_wrmsr(u32 msr, u64 val)
> >          vmce->mcg_status = val;
> >          mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
> > >          /* For HVM guest, this is the point for deleting vMCE injection node */
> > -        if ( d->is_hvm && (vmce->nr_injection > 0) )
> > +        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
> >          {
> >              vmce->nr_injection--; /* Should be 0 */
> >              if ( !list_empty(&vmce->impact_header) )
> > @@ -293,7 +297,7 @@ int vmce_wrmsr(u32 msr, u64 val)
> >          ret = -1;
> >          break;
> >      default:
> > -        ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0;
> > +        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
> >          break;
> >      }
> > 
> > @@ -301,6 +305,47 @@ int vmce_wrmsr(u32 msr, u64 val)
> >      return ret;
> >  }
> > 
> > +static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> > +{
> > +    struct vcpu *v;
> > +    int err = 0;
> > +
> > +    for_each_vcpu( d, v ) {
> > +        struct hvm_vmce_vcpu ctxt = {
> > +            .nr_banks = v->arch.nr_vmce_banks
> > +        };
> > +
> > +        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
> > +        if ( err )
> > +            break;
> > +    }
> > +
> > +    return err;
> > +}
> > +
> > +static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> > +{
> > +    unsigned int vcpuid = hvm_load_instance(h);
> > +    struct vcpu *v;
> > +    struct hvm_vmce_vcpu ctxt;
> > +    int err;
> > +
> > +    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
> > +    {
> > > +        gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
> > +        return -EINVAL;
> > +    }
> > +
> > +    err = hvm_load_entry(VMCE_VCPU, h, &ctxt);
> > +    if ( !err )
> > +        v->arch.nr_vmce_banks = ctxt.nr_banks;
> > +
> > +    return err;
> > +}
> > +
> > +HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
> > +                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
> > +
> >  int inject_vmce(struct domain *d)
> >  {
> >      int cpu = smp_processor_id();
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -468,6 +468,8 @@ int vcpu_initialise(struct vcpu *v)
> > 
> > >      v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> > 
> > +    vmce_init_vcpu(v);
> > +
> >      rc = is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0;
> >   done:
> >      if ( rc )
> > --- a/xen/arch/x86/domctl.c
> > +++ b/xen/arch/x86/domctl.c
> > @@ -1027,11 +1027,12 @@ long arch_do_domctl(
> >                  evc->syscall32_callback_eip    = 0;
> >                  evc->syscall32_disables_events = 0;
> >              }
> > +            evc->nr_mce_banks = v->arch.nr_vmce_banks;
> >          }
> >          else
> >          {
> >              ret = -EINVAL;
> > -            if ( evc->size != sizeof(*evc) )
> > +            if ( evc->size < offsetof(typeof(*evc), nr_mce_banks) )
> >                  goto ext_vcpucontext_out;
> >  #ifdef __x86_64__
> >              if ( !is_hvm_domain(d) )
> > @@ -1059,6 +1060,16 @@ long arch_do_domctl(
> >                   (evc->syscall32_callback_cs & ~3) ||
> >                   evc->syscall32_callback_eip )
> >                  goto ext_vcpucontext_out;
> > +
> > > +            if ( evc->size >= offsetof(typeof(*evc), mbz[ARRAY_SIZE(evc->mbz)]) )
> > +            {
> > +                unsigned int i;
> > +
> > +                for ( i = 0; i < ARRAY_SIZE(evc->mbz); ++i )
> > +                    if ( evc->mbz[i] )
> > +                        goto ext_vcpucontext_out;
> > +                v->arch.nr_vmce_banks = evc->nr_mce_banks;
> > +            }
> >          }
> > 
> >          ret = 0;
> > --- a/xen/include/asm-x86/domain.h
> > +++ b/xen/include/asm-x86/domain.h
> > @@ -488,6 +488,8 @@ struct arch_vcpu
> > >      /* This variable determines whether nonlazy extended state has been used,
> >       * and thus should be saved/restored. */
> >      bool_t nonlazy_xstate_used;
> > +
> > +    uint8_t nr_vmce_banks;
> > 
> >      struct paging_vcpu paging;
> > 
> > --- a/xen/include/asm-x86/mce.h
> > +++ b/xen/include/asm-x86/mce.h
> > @@ -28,6 +28,7 @@ struct domain_mca_msrs
> >  /* Guest vMCE MSRs virtualization */
> >  extern int vmce_init_msr(struct domain *d);
> >  extern void vmce_destroy_msr(struct domain *d);
> > +extern void vmce_init_vcpu(struct vcpu *);
> >  extern int vmce_wrmsr(uint32_t msr, uint64_t val);
> >  extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
> > 
> > --- a/xen/include/public/arch-x86/hvm/save.h
> > +++ b/xen/include/public/arch-x86/hvm/save.h
> > @@ -585,9 +585,15 @@ struct hvm_viridian_vcpu_context {
> > 
> >  DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
> > 
> > +struct hvm_vmce_vcpu {
> > +    uint8_t nr_banks;
> > +};
> > +
> > +DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
> > +
> >  /*
> >   * Largest type-code in use
> >   */
> > -#define HVM_SAVE_CODE_MAX 17
> > +#define HVM_SAVE_CODE_MAX 18
> > 
> >  #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
> > --- a/xen/include/public/domctl.h
> > +++ b/xen/include/public/domctl.h
> > @@ -559,7 +559,7 @@ struct xen_domctl_ext_vcpucontext {
> >      uint32_t         vcpu;
> >      /*
> >       * SET: Size of struct (IN)
> > -     * GET: Size of struct (OUT)
> > +     * GET: Size of struct (OUT, up to 128 bytes)
> >       */
> >      uint32_t         size;
> >  #if defined(__i386__) || defined(__x86_64__)
> > @@ -571,6 +571,10 @@ struct xen_domctl_ext_vcpucontext {
> >      uint16_t         sysenter_callback_cs;
> >      uint8_t          syscall32_disables_events;
> >      uint8_t          sysenter_disables_events;
> > +    uint8_t          nr_mce_banks;
> > +    /* This array can be split and used for future extension. */
> > +    /* It is there just to grow the structure beyond 4.1's size. */
> > +    uint8_t          mbz[9];
> >  #endif
> >  };
> >  typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
> > 
> 
> 
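
For completeness, a hedged sketch of how a libxc-based save/restore path
might carry the new field (modelled on the existing
XEN_DOMCTL_{get,set}_ext_vcpucontext plumbing; error handling is elided,
and xch, domid and vcpu are assumed to be in scope):

    DECLARE_DOMCTL;

    /* Source host: fetch the extended context.  The hypervisor fills
     * evc->size; nr_mce_banks is only meaningful if size covers it. */
    domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
    domctl.domain = domid;
    domctl.u.ext_vcpucontext.vcpu = vcpu;
    if ( xc_domctl(xch, &domctl) < 0 )
        return -1;

    /* ... transmit domctl.u.ext_vcpucontext in the migration stream ... */

    /* Target host: replay it.  The size check in the patch accepts an
     * old (4.1-sized) record and validates the mbz padding on new ones. */
    domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
    if ( xc_domctl(xch, &domctl) < 0 )
        return -1;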



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 

