|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH RFC] pass-through: sync pir to irr after msix vector been updated
On Thu, Sep 26, 2019 at 01:33:42PM -0700, Joe Jin wrote:
> On 9/24/19 8:42 AM, Roger Pau Monné wrote:
> > On Fri, Sep 13, 2019 at 09:50:34AM -0700, Joe Jin wrote:
> >> On 9/13/19 3:33 AM, Roger Pau Monné wrote:
> >>> On Thu, Sep 12, 2019 at 11:03:14AM -0700, Joe Jin wrote:
> >>>> With below testcase, guest kernel reported "No irq handler for vector":
> >>>> 1). Passthrough mlx ib VF to 2 pvhvm guests.
> >>>> 2). Start rds-stress between 2 guests.
> >>>> 3). Scale down 2 guests vcpu from 32 to 6 at the same time.
> >>>>
> >>>> Repeat above test several iteration, guest kernel reported "No irq
> >>>> handler
> >>>> for vector", and IB traffic downed to zero which caused by interrupt
> >>>> lost.
> >>>>
> >>>> When vcpu offline, kernel disabled local IRQ, migrate IRQ to other cpu,
> >>>> update MSI-X table, enable IRQ. If any new interrupt arrived after
> >>>> local IRQ disabled also before MSI-X table been updated, interrupt still
> >>>> used old vector and dest cpu info, and when local IRQ enabled again,
> >>>> interrupt been sent to wrong cpu and vector.
> >>>
> >>> Yes, but that's something Linux shoulkd be able to handle, according
> >>> to your description there's a window where interrupts can be delivered
> >>> to the old CPU, but that's something expected.
> >>
> >> Actually, kernel will check APIC IRR when do migration, if any pending
> >> IRQ, will retrigger IRQ to new destination, but Xen does not set the
> >> bit.
> >
> > Right, because the interrupt is pending in the PIRR posted descriptor
> > field, it has not yet reached the vlapic IRR.
> >
> >>>
> >>>>
> >>>> Looks sync PIR to IRR after MSI-X been updated is help for this issue.
> >>>
> >>> AFAICT the sync that you do is still using the old vcpu id, as
> >>> pirq_dpci->gmsi.dest_vcpu_id gets updated a little bit below. I'm
> >>> unsure about why does this help, I would expect the sync between pir
> >>> and irr to happen anyway, and hence I'm not sure why is this helping.
> >>
> >> As my above update, IRQ retriggered by old cpu, so Xen need to set IRR
> >> for old cpu but not of new.
> >
> > AFAICT you are draining any pending data from the posted interrupt
> > PIRR field into the IRR vlapic field, so that no stale interrupts are
> > left behind after the MSI fields have been updated by the guest. I
> > think this is correct, I wonder however whether you also need to
> > trigger a vcpu re-scheduling (pause/unpause the vpcu), so that pending
> > interrupts in IRR are injected by vmx_intr_assist.
> >
> > Also, I think you should do this syncing after the pi_update_irte
> > call, or else there's still a window (albeit small) where you can
> > still get posted interrupts delivered to the old vcpu.
>
> I agree with you that we need to take care of this issue as well.
Can you give a try to the patch above? I don't have the hardware to
test any of this ATM, so your help would be appreciated.
Thanks, Roger.
---8<---
diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
index 9466258d6f..d255ad8db7 100644
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -106,7 +106,7 @@ static void vlapic_clear_irr(int vector, struct vlapic
*vlapic)
vlapic_clear_vector(vector, &vlapic->regs->data[APIC_IRR]);
}
-static void sync_pir_to_irr(struct vcpu *v)
+void vlapic_sync_pir_to_irr(struct vcpu *v)
{
if ( hvm_funcs.sync_pir_to_irr )
alternative_vcall(hvm_funcs.sync_pir_to_irr, v);
@@ -114,7 +114,7 @@ static void sync_pir_to_irr(struct vcpu *v)
static int vlapic_find_highest_irr(struct vlapic *vlapic)
{
- sync_pir_to_irr(vlapic_vcpu(vlapic));
+ vlapic_sync_pir_to_irr(vlapic_vcpu(vlapic));
return vlapic_find_highest_vector(&vlapic->regs->data[APIC_IRR]);
}
@@ -1493,7 +1493,7 @@ static int lapic_save_regs(struct vcpu *v,
hvm_domain_context_t *h)
if ( !has_vlapic(v->domain) )
return 0;
- sync_pir_to_irr(v);
+ vlapic_sync_pir_to_irr(v);
return hvm_save_entry(LAPIC_REGS, v->vcpu_id, h, vcpu_vlapic(v)->regs);
}
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index b292e79382..88188d2d7f 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -341,7 +341,7 @@ int pt_irq_create_bind(
{
uint8_t dest, delivery_mode;
bool dest_mode;
- int dest_vcpu_id;
+ int dest_vcpu_id, prev_vcpu_id = -1;
const struct vcpu *vcpu;
uint32_t gflags = pt_irq_bind->u.msi.gflags &
~XEN_DOMCTL_VMSI_X86_UNMASKED;
@@ -411,6 +411,7 @@ int pt_irq_create_bind(
pirq_dpci->gmsi.gvec = pt_irq_bind->u.msi.gvec;
pirq_dpci->gmsi.gflags = gflags;
+ prev_vcpu_id = pirq_dpci->gmsi.dest_vcpu_id;
}
}
/* Calculate dest_vcpu_id for MSI-type pirq migration. */
@@ -440,7 +441,8 @@ int pt_irq_create_bind(
/* Use interrupt posting if it is supported. */
if ( iommu_intpost )
pi_update_irte(vcpu ? &vcpu->arch.hvm.vmx.pi_desc : NULL,
- info, pirq_dpci->gmsi.gvec);
+ info, pirq_dpci->gmsi.gvec,
+ prev_vcpu_id >= 0 ? d->vcpu[prev_vcpu_id] : NULL);
if ( pt_irq_bind->u.msi.gflags & XEN_DOMCTL_VMSI_X86_UNMASKED )
{
@@ -729,7 +731,9 @@ int pt_irq_destroy_bind(
what = "bogus";
}
else if ( pirq_dpci && pirq_dpci->gmsi.posted )
- pi_update_irte(NULL, pirq, 0);
+ pi_update_irte(NULL, pirq, 0,
+ pirq_dpci->gmsi.dest_vcpu_id >= 0
+ ? d->vcpu[pirq_dpci->gmsi.dest_vcpu_id] : NULL);
if ( pirq_dpci && (pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) &&
list_empty(&pirq_dpci->digl_list) )
diff --git a/xen/drivers/passthrough/vtd/intremap.c
b/xen/drivers/passthrough/vtd/intremap.c
index bf846195c4..ad03abb4da 100644
--- a/xen/drivers/passthrough/vtd/intremap.c
+++ b/xen/drivers/passthrough/vtd/intremap.c
@@ -951,7 +951,7 @@ void intel_iommu_disable_eim(void)
* when guest changes MSI/MSI-X information.
*/
int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
- const uint8_t gvec)
+ const uint8_t gvec, struct vcpu *prev)
{
struct irq_desc *desc;
struct msi_desc *msi_desc;
@@ -964,8 +964,8 @@ int pi_update_irte(const struct pi_desc *pi_desc, const
struct pirq *pirq,
msi_desc = desc->msi_desc;
if ( !msi_desc )
{
- rc = -ENODEV;
- goto unlock_out;
+ spin_unlock_irq(&desc->lock);
+ return -ENODEV;
}
msi_desc->pi_desc = pi_desc;
msi_desc->gvec = gvec;
@@ -974,10 +974,9 @@ int pi_update_irte(const struct pi_desc *pi_desc, const
struct pirq *pirq,
ASSERT(pcidevs_locked());
- return msi_msg_write_remap_rte(msi_desc, &msi_desc->msg);
-
- unlock_out:
- spin_unlock_irq(&desc->lock);
+ rc = msi_msg_write_remap_rte(msi_desc, &msi_desc->msg);
+ if ( !rc && prev )
+ vlapic_sync_pir_to_irr(prev);
return rc;
}
diff --git a/xen/include/asm-x86/hvm/vlapic.h b/xen/include/asm-x86/hvm/vlapic.h
index dde66b4f0f..b0017d1dae 100644
--- a/xen/include/asm-x86/hvm/vlapic.h
+++ b/xen/include/asm-x86/hvm/vlapic.h
@@ -150,4 +150,6 @@ bool_t vlapic_match_dest(
const struct vlapic *target, const struct vlapic *source,
int short_hand, uint32_t dest, bool_t dest_mode);
+void vlapic_sync_pir_to_irr(struct vcpu *v);
+
#endif /* __ASM_X86_HVM_VLAPIC_H__ */
diff --git a/xen/include/asm-x86/iommu.h b/xen/include/asm-x86/iommu.h
index 85741f7c96..314dcfbe47 100644
--- a/xen/include/asm-x86/iommu.h
+++ b/xen/include/asm-x86/iommu.h
@@ -119,7 +119,7 @@ static inline void iommu_disable_x2apic(void)
extern bool untrusted_msi;
int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
- const uint8_t gvec);
+ const uint8_t gvec, struct vcpu *prev);
#endif /* !__ARCH_X86_IOMMU_H__ */
/*
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |