Re: [Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support
On Fri, Nov 17, 2017 at 02:22:23PM +0800, Chao Gao wrote:
> Queued Invalidation Interface is an expanded invalidation interface with
> extended capabilities. Hardware implementations report support for queued
> invalidation interface through the Extended Capability Register. The queued
> invalidation interface uses an Invalidation Queue (IQ), which is a circular
> buffer in system memory. Software submits commands by writing Invalidation
> Descriptors to the IQ.
>
> In this patch, a new function viommu_process_iq() is used for emulating how
> hardware handles invalidation requests through QI.
You should mention that QI is mandatory in order to support interrupt
remapping.
I was about to ask whether QI could be deferred to a later stage, but
AFAICT this is not an option.
> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
>
> ---
> v4:
> - Introduce a lock to protect invalidation related registers.
> ---
> xen/drivers/passthrough/vtd/iommu.h | 24 +++-
> xen/drivers/passthrough/vtd/vvtd.c  | 271 +++++++++++++++++++++++++++++++++++-
> 2 files changed, 293 insertions(+), 2 deletions(-)
>
> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
> index b71dab8..de9188b 100644
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -47,7 +47,12 @@
> #define DMAR_IQH_REG 0x80 /* invalidation queue head */
> #define DMAR_IQT_REG 0x88 /* invalidation queue tail */
> #define DMAR_IQA_REG 0x90 /* invalidation queue addr */
> +#define DMAR_IQUA_REG 0x94 /* invalidation queue upper addr */
> +#define DMAR_ICS_REG 0x9c /* invalidation completion status */
> #define DMAR_IECTL_REG 0xa0 /* invalidation event control register */
> +#define DMAR_IEDATA_REG 0xa4 /* invalidation event data register */
> +#define DMAR_IEADDR_REG 0xa8 /* invalidation event address register */
> +#define DMAR_IEUADDR_REG 0xac /* upper address register */
> #define DMAR_IRTA_REG 0xb8 /* base address of intr remap table */
> #define DMAR_IRTUA_REG 0xbc /* upper address of intr remap table */
>
> @@ -175,6 +180,21 @@
> #define DMA_IRTA_S(val) (val & 0xf)
> #define DMA_IRTA_SIZE(val) (1UL << (DMA_IRTA_S(val) + 1))
>
> +/* IQA_REG */
> +#define DMA_IQA_ADDR(val) (val & ~0xfffULL)
> +#define DMA_IQA_QS(val) (val & 0x7)
> +#define DMA_IQA_RSVD 0xff8ULL
> +
> +/* IECTL_REG */
> +#define DMA_IECTL_IM_SHIFT 31
> +#define DMA_IECTL_IM (1U << DMA_IECTL_IM_SHIFT)
> +#define DMA_IECTL_IP_SHIFT 30
> +#define DMA_IECTL_IP (1U << DMA_IECTL_IP_SHIFT)
> +
> +/* ICS_REG */
> +#define DMA_ICS_IWC_SHIFT 0
> +#define DMA_ICS_IWC (1U << DMA_ICS_IWC_SHIFT)
> +
> /* PMEN_REG */
> #define DMA_PMEN_EPM (((u32)1) << 31)
> #define DMA_PMEN_PRS (((u32)1) << 0)
> @@ -205,13 +225,14 @@
> /* FSTS_REG */
> #define DMA_FSTS_PFO_SHIFT 0
> #define DMA_FSTS_PPF_SHIFT 1
> +#define DMA_FSTS_IQE_SHIFT 4
> #define DMA_FSTS_PRO_SHIFT 7
>
> #define DMA_FSTS_PFO ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
> #define DMA_FSTS_PPF ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
> #define DMA_FSTS_AFO ((uint32_t)1 << 2)
> #define DMA_FSTS_APF ((uint32_t)1 << 3)
> -#define DMA_FSTS_IQE ((uint32_t)1 << 4)
> +#define DMA_FSTS_IQE ((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
> #define DMA_FSTS_ICE ((uint32_t)1 << 5)
> #define DMA_FSTS_ITE ((uint32_t)1 << 6)
> #define DMA_FSTS_PRO ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
> @@ -555,6 +576,7 @@ struct qinval_entry {
>
> /* Queue invalidation head/tail shift */
> #define QINVAL_INDEX_SHIFT 4
> +#define QINVAL_INDEX_MASK 0x7fff0ULL
>
> #define qinval_present(v) ((v).lo & 1)
> #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
> index a2fa64a..81170ec 100644
> --- a/xen/drivers/passthrough/vtd/vvtd.c
> +++ b/xen/drivers/passthrough/vtd/vvtd.c
> @@ -27,6 +27,7 @@
> #include <asm/event.h>
> #include <asm/io_apic.h>
> #include <asm/hvm/domain.h>
> +#include <asm/hvm/support.h>
> #include <asm/p2m.h>
>
> #include "iommu.h"
> @@ -68,6 +69,9 @@ struct vvtd {
>
> struct hvm_hw_vvtd hw;
> void *irt_base;
> + void *inv_queue_base;
Why not declare this as:
struct qinval_entry *
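i.e. something along these lines (just a sketch):

    /* Typed pointer avoids the cast when indexing the queue below. */
    struct qinval_entry *inv_queue_base;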
> + /* This lock protects invalidation related registers */
> + spinlock_t ie_lock;
As noted in another patch, I think the first approach should be to use
a single lock that serializes access to the whole vIOMMU register
space. Later we can see about more fine grained locking.
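i.e. a single (sketch, the name is of course up to you):

    /* Serializes all accesses to the emulated register block. */
    spinlock_t regs_lock;

in struct vvtd, taken at the top of the MMIO read/write handlers, would
be enough for a first implementation.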
> };
>
> /* Setting viommu_verbose enables debugging messages of vIOMMU */
> @@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
> vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
> }
>
> +static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
> +{
> + vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
> + vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
> +}
> +
> /* Computing the IRTE index for a given interrupt request. When success, return
> * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
> * i.e. -1 when the irq request isn't a remapping format.
> @@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
> return X86EMUL_OKAY;
> }
>
> +/*
> + * Process an invalidation descriptor. Currently, only two types of
> + * descriptors, Interrupt Entry Cache Invalidation Descriptor and
> + * Invalidation Wait Descriptor, are handled.
> + * @vvtd: the virtual vtd instance
> + * @i: the index of the invalidation descriptor to be processed
> + *
> + * Return 0 on success, or non-zero on failure.
> + */
> +static int process_iqe(struct vvtd *vvtd, uint32_t i)
> +{
> + struct qinval_entry qinval;
> + int ret = 0;
> +
> + if ( !vvtd->inv_queue_base )
> + {
> + gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
> + return -1;
If you just return -1 or 0 please use bool instead. Or return proper
error codes.
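i.e. (a sketch, using the standard errno values):

    if ( !vvtd->inv_queue_base )
    {
        gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
        return -ENOENT;
    }

and -EINVAL (or similar) for the unsupported descriptor type below.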
> + }
> + qinval = ((struct qinval_entry *)vvtd->inv_queue_base)[i];
See my comment above regarding how inv_queue_base is declared, I'm not
sure why the copy is needed here.
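With the typed declaration suggested above this could simply be (sketch):

    const struct qinval_entry *qinval = &vvtd->inv_queue_base[i];

    switch ( qinval->q.inv_wait_dsc.lo.type )
    ...

avoiding the 16 byte structure copy.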
> +
> + switch ( qinval.q.inv_wait_dsc.lo.type )
> + {
> + case TYPE_INVAL_WAIT:
> + if ( qinval.q.inv_wait_dsc.lo.sw )
> + {
> + uint32_t data = qinval.q.inv_wait_dsc.lo.sdata;
> + uint64_t addr = qinval.q.inv_wait_dsc.hi.saddr << 2;
> +
> + ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
> + if ( ret )
> + vvtd_info("Failed to write status address\n");
> + }
> +
> + /*
> + * The following code generates an invalidation completion event
> + * indicating the invalidation wait descriptor completion. Note that
> + * the following code fragment is not tested properly.
> + */
> + if ( qinval.q.inv_wait_dsc.lo.iflag )
> + {
> + if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
> + {
> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> + if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
> + {
> + vvtd_notify_inv_completion(vvtd);
> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> + }
> + }
> + }
> + break;
> +
> + case TYPE_INVAL_IEC:
> + /* No cache is preserved in vvtd, nothing is needed to be flushed */
> + break;
> +
> + default:
> + vvtd_debug("d%d: Invalidation type (%x) isn't supported\n",
> + vvtd->domain->domain_id, qinval.q.inv_wait_dsc.lo.type);
> + return -1;
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * Process all the pending descriptors in the Invalidation Queue.
> + */
> +static void vvtd_process_iq(struct vvtd *vvtd)
> +{
> + uint32_t max_entry, i, iqh, iqt;
> + int err = 0;
> +
> + /* Trylock avoids more than 1 caller dealing with invalidation requests */
> + if ( !spin_trylock(&vvtd->ie_lock) )
Uh, is this correct? You are returning without the queue being
invalidated AFAICT.
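If the intention is just to serialize callers, a plain (sketch):

    spin_lock(&vvtd->ie_lock);

would guarantee the queue always gets processed; with trylock the caller
that loses the race simply returns, and AFAICT nothing rechecks the
queue on its behalf once the lock holder has already read IQT.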
> + return;
> +
> + iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG), QINVAL_INDEX_MASK);
> + iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG), QINVAL_INDEX_MASK);
> + /*
> + * No new descriptor is fetched from the Invalidation Queue until
> + * software clears the IQE field in the Fault Status Register
> + */
> + if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
> + {
> + spin_unlock(&vvtd->ie_lock);
> + return;
> + }
> +
> + max_entry = 1 << (QINVAL_ENTRY_ORDER +
> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
> +
> + ASSERT(iqt < max_entry);
Is it possible for the user to write a valid value to DMAR_IQT_REG and
then change DMAR_IQA_REG in order to make the above ASSERT trigger?
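IMO a guest-triggerable condition shouldn't be an ASSERT, this should
rather be reported back as an invalidation queue error, e.g. (sketch,
reusing the error path you already have at the bottom of the function):

    if ( iqt >= max_entry )
    {
        spin_unlock(&vvtd->ie_lock);
        spin_lock(&vvtd->fe_lock);
        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
        spin_unlock(&vvtd->fe_lock);
        return;
    }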
> + if ( iqh == iqt )
> + {
> + spin_unlock(&vvtd->ie_lock);
> + return;
> + }
> +
> + for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
> + {
> + err = process_iqe(vvtd, i);
> + if ( err )
> + break;
> + }
> +
> + /*
> + * Set IQH before checking for an error, because IQH should reference
> + * the descriptor associated with the error when an error is seen
> + * by the guest.
> + */
> + vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
> +
> + spin_unlock(&vvtd->ie_lock);
> + if ( err )
> + {
> + spin_lock(&vvtd->fe_lock);
> + vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
> + spin_unlock(&vvtd->fe_lock);
> + }
> +}
> +
> +static void vvtd_write_iqt(struct vvtd *vvtd, uint32_t val)
> +{
> + uint32_t max_entry;
> +
> + if ( val & ~QINVAL_INDEX_MASK )
> + {
> + vvtd_info("attempts to set reserved bits in IQT\n");
> + return;
> + }
> +
> + max_entry = 1U << (QINVAL_ENTRY_ORDER +
> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
> + if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
> + {
> + vvtd_info("IQT: Value %x exceeded supported max index.", val);
> + return;
> + }
> +
> + vvtd_set_reg(vvtd, DMAR_IQT_REG, val);
> +}
> +
> +static void vvtd_write_iqa(struct vvtd *vvtd, uint32_t val, bool high)
> +{
> + uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> + uint64_t new;
> +
> + if ( high )
> + new = ((uint64_t)val << 32) | (old & 0xffffffff);
> + else
> + new = ((old >> 32) << 32) | val;
You can also use old & ~0xffffffffUL
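i.e. (equivalent, and avoids the double shift):

    new = (old & ~0xffffffffUL) | val;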
> +
> + if ( new & (~((1ULL << cap_mgaw(cap)) - 1) | DMA_IQA_RSVD) )
> + {
> + vvtd_info("Attempt to set reserved bits in IQA\n");
> + return;
> + }
> +
> + vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, new);
> + if ( high && !vvtd->inv_queue_base )
> + vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
> + PFN_DOWN(DMA_IQA_ADDR(new)),
> + 1 << DMA_IQA_QS(new));
Don't you need to pick a reference to this page(s)?
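i.e. I would expect the mapping helper to grab one per page, something
like (sketch, assuming map_guest_pages doesn't do this internally
already):

    p2m_type_t t;
    struct page_info *pg = get_page_from_gfn(d, gfn, &t, P2M_ALLOC);

    if ( !pg || !p2m_is_ram(t) )
        /* fail the mapping */;

and the matching put_page() on unmap, so the guest cannot free the
memory behind the vIOMMU's back.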
> + else if ( !high && vvtd->inv_queue_base )
I'm not sure I follow the logic with high here.
> + {
> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
> + vvtd->inv_queue_base = NULL;
> + }
> +}
> +
> +static void vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
> +{
> + if ( val & DMA_ICS_IWC )
> + {
> + vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
> + /* When IWC field is cleared, the IP field needs to be cleared */
> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> + }
> +}
> +
> static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
> {
> /* Writing a 1 means clear fault */
> @@ -489,6 +682,20 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t
> val)
> return X86EMUL_OKAY;
> }
>
> +static void vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
> +{
> + /* Only the DMA_IECTL_IM bit is writable. Generate the pending event on unmask */
> + if ( !(val & DMA_IECTL_IM) )
> + {
> + /* Clear IM and clear IP */
> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> + if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
> + vvtd_notify_inv_completion(vvtd);
> + }
> + else
> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> +}
> +
> static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
> {
> /*
> @@ -681,6 +888,48 @@ static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
> spin_unlock(&vvtd->fe_lock);
> }
>
> +static void vvtd_write_invalidation_regs(struct vvtd *vvtd, unsigned long val,
> + unsigned int offset, unsigned int len)
> +{
> + spin_lock(&vvtd->ie_lock);
> + for ( ; len ; len -= 4, offset += 4, val = val >> 32)
Same comment as in the previous patch, I don't really like the for
loop, but I guess 64bit accesses must be allowed to this group of
registers?
> + {
> + switch ( offset )
> + {
> + case DMAR_IECTL_REG:
> + vvtd_write_iectl(vvtd, val);
> + break;
> +
> + case DMAR_ICS_REG:
> + vvtd_write_ics(vvtd, val);
> + break;
> +
> + case DMAR_IQT_REG:
> + vvtd_write_iqt(vvtd, val);
> + break;
> +
> + case DMAR_IQA_REG:
> + vvtd_write_iqa(vvtd, val, 0);
> + break;
> +
> + case DMAR_IQUA_REG:
> + vvtd_write_iqa(vvtd, val, 1);
> + break;
> +
> + case DMAR_IEDATA_REG:
> + case DMAR_IEADDR_REG:
> + case DMAR_IEUADDR_REG:
> + vvtd_set_reg(vvtd, offset, val);
> + break;
> +
> + default:
> + break;
> + }
> + }
> + spin_unlock(&vvtd->ie_lock);
> +
> +}
> +
> static int vvtd_write(struct vcpu *v, unsigned long addr,
> unsigned int len, unsigned long val)
> {
> @@ -719,6 +968,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
> vvtd_write_fault_regs(vvtd, val, offset, len);
> break;
>
> + case DMAR_IECTL_REG:
> + case DMAR_ICS_REG:
> + case DMAR_IQT_REG:
> + case DMAR_IQA_REG:
> + case DMAR_IQUA_REG:
> + case DMAR_IEDATA_REG:
> + case DMAR_IEADDR_REG:
> + case DMAR_IEUADDR_REG:
> + vvtd_write_invalidation_regs(vvtd, val, offset, len);
> + break;
> +
> default:
> if ( (offset == (fault_offset + DMA_FRCD2_OFFSET)) ||
> (offset == (fault_offset + DMA_FRCD3_OFFSET)) )
> @@ -840,7 +1100,8 @@ static int vvtd_handle_irq_request(const struct domain *d,
> irte.remap.tm);
>
> out:
> - atomic_dec(&vvtd->inflight_intr);
> + if ( !atomic_dec_and_test(&vvtd->inflight_intr) )
> + vvtd_process_iq(vvtd);
> return ret;
> }
>
> @@ -911,6 +1172,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
> vvtd->domain = d;
> register_mmio_handler(d, &vvtd_mmio_ops);
> spin_lock_init(&vvtd->fe_lock);
> + spin_lock_init(&vvtd->ie_lock);
>
> viommu->priv = vvtd;
>
> @@ -930,6 +1192,13 @@ static int vvtd_destroy(struct viommu *viommu)
> sizeof(struct iremap_entry)));
> vvtd->irt_base = NULL;
> }
> + if ( vvtd->inv_queue_base )
> + {
> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +
> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
Don't you also need to unmap this page(s) when QIE is disabled?
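i.e. in the global command handler, something along the lines of
(sketch, helper names depending on how the rest of the series does it):

    if ( !(val & DMA_GCMD_QIE) && vvtd->inv_queue_base )
    {
        unmap_guest_pages(vvtd->inv_queue_base,
                          1 << DMA_IQA_QS(vvtd_get_reg_quad(vvtd,
                                                            DMAR_IQA_REG)));
        vvtd->inv_queue_base = NULL;
    }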
Thanks, Roger.
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel