Re: [Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support
On Mon, Feb 12, 2018 at 02:36:10PM +0000, Roger Pau Monné wrote:
>On Fri, Nov 17, 2017 at 02:22:23PM +0800, Chao Gao wrote:
>> Queued Invalidation Interface is an expanded invalidation interface with
>> extended capabilities. Hardware implementations report support for queued
>> invalidation interface through the Extended Capability Register. The queued
>> invalidation interface uses an Invalidation Queue (IQ), which is a circular
>> buffer in system memory. Software submits commands by writing Invalidation
>> Descriptors to the IQ.
>>
>> In this patch, a new function viommu_process_iq() is used for emulating how
>> hardware handles invalidation requests through QI.
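For readers unfamiliar with QI, here is a minimal standalone sketch (plain C, illustrative names only, not the patch code) of the head/tail bookkeeping the description above refers to: software writes descriptors at the tail and advances IQT, while the emulation fetches from the head until it catches up with the tail.

/* Standalone illustration of Invalidation Queue head/tail handling. */
#include <stdint.h>
#include <stdio.h>

#define QUEUE_ENTRIES 256                /* illustrative queue size */

struct inv_queue {
    uint32_t head;                       /* next descriptor to be fetched */
    uint32_t tail;                       /* next free slot for software */
    uint64_t desc[QUEUE_ENTRIES][2];     /* 128-bit descriptors */
};

/* Software submits a descriptor at the tail, then advances the tail (IQT). */
static void submit(struct inv_queue *q, uint64_t lo, uint64_t hi)
{
    q->desc[q->tail][0] = lo;
    q->desc[q->tail][1] = hi;
    q->tail = (q->tail + 1) % QUEUE_ENTRIES;
}

/* The emulation drains descriptors from the head until head == tail. */
static void drain(struct inv_queue *q)
{
    while ( q->head != q->tail )
    {
        /* process q->desc[q->head][0..1] here */
        q->head = (q->head + 1) % QUEUE_ENTRIES;
    }
}

int main(void)
{
    struct inv_queue q = { 0 };

    submit(&q, 0x5 /* e.g. an inv-wait-like type tag */, 0);
    drain(&q);
    printf("head=%u tail=%u\n", q.head, q.tail);
    return 0;
}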
>
>You should mention that QI is mandatory in order to support interrupt
>remapping.
Will do.
>
>I was about to ask whether QI could be deferred to a later stage, but
>AFAICT this is not an option.
>
>> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
>> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
>>
>> ---
>> v4:
>> - Introduce a lock to protect invalidation related registers.
>> ---
>> xen/drivers/passthrough/vtd/iommu.h | 24 +++-
>> xen/drivers/passthrough/vtd/vvtd.c | 271 +++++++++++++++++++++++++++++++++++-
>> 2 files changed, 293 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
>> index b71dab8..de9188b 100644
>> --- a/xen/drivers/passthrough/vtd/iommu.h
>> +++ b/xen/drivers/passthrough/vtd/iommu.h
>> @@ -47,7 +47,12 @@
>> #define DMAR_IQH_REG 0x80 /* invalidation queue head */
>> #define DMAR_IQT_REG 0x88 /* invalidation queue tail */
>> #define DMAR_IQA_REG 0x90 /* invalidation queue addr */
>> +#define DMAR_IQUA_REG 0x94 /* invalidation queue upper addr */
>> +#define DMAR_ICS_REG 0x9c /* invalidation completion status */
>> #define DMAR_IECTL_REG 0xa0 /* invalidation event control register */
>> +#define DMAR_IEDATA_REG 0xa4 /* invalidation event data register */
>> +#define DMAR_IEADDR_REG 0xa8 /* invalidation event address register */
>> +#define DMAR_IEUADDR_REG 0xac /* upper address register */
>> #define DMAR_IRTA_REG 0xb8 /* base address of intr remap table */
>> #define DMAR_IRTUA_REG 0xbc /* upper address of intr remap table */
>>
>> @@ -175,6 +180,21 @@
>> #define DMA_IRTA_S(val) (val & 0xf)
>> #define DMA_IRTA_SIZE(val) (1UL << (DMA_IRTA_S(val) + 1))
>>
>> +/* IQA_REG */
>> +#define DMA_IQA_ADDR(val) (val & ~0xfffULL)
>> +#define DMA_IQA_QS(val) (val & 0x7)
>> +#define DMA_IQA_RSVD 0xff8ULL
>> +
>> +/* IECTL_REG */
>> +#define DMA_IECTL_IM_SHIFT 31
>> +#define DMA_IECTL_IM (1U << DMA_IECTL_IM_SHIFT)
>> +#define DMA_IECTL_IP_SHIFT 30
>> +#define DMA_IECTL_IP (1U << DMA_IECTL_IP_SHIFT)
>> +
>> +/* ICS_REG */
>> +#define DMA_ICS_IWC_SHIFT 0
>> +#define DMA_ICS_IWC (1U << DMA_ICS_IWC_SHIFT)
>> +
>> /* PMEN_REG */
>> #define DMA_PMEN_EPM (((u32)1) << 31)
>> #define DMA_PMEN_PRS (((u32)1) << 0)
>> @@ -205,13 +225,14 @@
>> /* FSTS_REG */
>> #define DMA_FSTS_PFO_SHIFT 0
>> #define DMA_FSTS_PPF_SHIFT 1
>> +#define DMA_FSTS_IQE_SHIFT 4
>> #define DMA_FSTS_PRO_SHIFT 7
>>
>> #define DMA_FSTS_PFO ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
>> #define DMA_FSTS_PPF ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
>> #define DMA_FSTS_AFO ((uint32_t)1 << 2)
>> #define DMA_FSTS_APF ((uint32_t)1 << 3)
>> -#define DMA_FSTS_IQE ((uint32_t)1 << 4)
>> +#define DMA_FSTS_IQE ((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
>> #define DMA_FSTS_ICE ((uint32_t)1 << 5)
>> #define DMA_FSTS_ITE ((uint32_t)1 << 6)
>> #define DMA_FSTS_PRO ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
>> @@ -555,6 +576,7 @@ struct qinval_entry {
>>
>> /* Queue invalidation head/tail shift */
>> #define QINVAL_INDEX_SHIFT 4
>> +#define QINVAL_INDEX_MASK 0x7fff0ULL
>>
>> #define qinval_present(v) ((v).lo & 1)
>> #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
>> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
>> index a2fa64a..81170ec 100644
>> --- a/xen/drivers/passthrough/vtd/vvtd.c
>> +++ b/xen/drivers/passthrough/vtd/vvtd.c
>> @@ -27,6 +27,7 @@
>> #include <asm/event.h>
>> #include <asm/io_apic.h>
>> #include <asm/hvm/domain.h>
>> +#include <asm/hvm/support.h>
>> #include <asm/p2m.h>
>>
>> #include "iommu.h"
>> @@ -68,6 +69,9 @@ struct vvtd {
>>
>> struct hvm_hw_vvtd hw;
>> void *irt_base;
>> + void *inv_queue_base;
>
>Why not declare this as:
>
>struct qinval_entry *
will do.
>
>> + /* This lock protects invalidation related registers */
>> + spinlock_t ie_lock;
>
>As noted in another patch, I think the first approach should be to use
>a single lock that serializes access to the whole vIOMMU register
>space. Later we can see about more fine grained locking.
It seems you mean a coarse-grained lock should be taken at the beginning of any
operation that reads or writes vIOMMU registers. Adding such a lock should be
easy.
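For illustration, the coarse-grained variant could simply wrap the existing register
dispatch, something like the fragment below (the 'reg_lock' field name is made up here):

    /* hypothetical: one lock serializing the whole vIOMMU register space */
    spin_lock(&vvtd->reg_lock);
    switch ( offset )
    {
        /* ... all fault and invalidation register cases ... */
    }
    spin_unlock(&vvtd->reg_lock);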
>
>> };
>>
>> /* Setting viommu_verbose enables debugging messages of vIOMMU */
>> @@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
>> vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
>> }
>>
>> +static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
>> +{
>> + vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
>> + vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
>> +}
>> +
>> /* Computing the IRTE index for a given interrupt request. When success, return
>> * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
>> * i.e. -1 when the irq request isn't an remapping format.
>> @@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
>> return X86EMUL_OKAY;
>> }
>>
>> +/*
>> + * Process an invalidation descriptor. Currently, only two types descriptors,
>> + * Interrupt Entry Cache Invalidation Descriptor and Invalidation Wait
>> + * Descriptor are handled.
>> + * @vvtd: the virtual vtd instance
>> + * @i: the index of the invalidation descriptor to be processed
>> + *
>> + * If success return 0, or return non-zero when failure.
>> + */
>> +static int process_iqe(struct vvtd *vvtd, uint32_t i)
>> +{
>> + struct qinval_entry qinval;
>> + int ret = 0;
>> +
>> + if ( !vvtd->inv_queue_base )
>> + {
>> + gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
>> + return -1;
>
>If you just return -1 or 0 please use bool instead. Or return proper
>error codes.
Will return meaningful error codes.
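For example (just a sketch of the agreed direction; the exact error values are a guess):

    if ( !vvtd->inv_queue_base )
        return -ENOENT;     /* queue base not programmed */
    ...
    default:
        return -EINVAL;     /* unsupported descriptor type */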
>
>> + }
>> + qinval = ((struct qinval_entry *)vvtd->inv_queue_base)[i];
>
>See my comment above regarding how inv_queue_base is declared, I'm not
>sure why the copy is needed here.
No copy is needed here. Will fix.
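I.e. roughly (sketch only), combining this with the declaration change suggested above:

    struct qinval_entry *inv_queue_base;    /* declaration in struct vvtd */
    ...
    const struct qinval_entry *qinval = &vvtd->inv_queue_base[i];

    switch ( qinval->q.inv_wait_dsc.lo.type )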
>
>> +
>> + switch ( qinval.q.inv_wait_dsc.lo.type )
>> + {
>> + case TYPE_INVAL_WAIT:
>> + if ( qinval.q.inv_wait_dsc.lo.sw )
>> + {
>> + uint32_t data = qinval.q.inv_wait_dsc.lo.sdata;
>> + uint64_t addr = qinval.q.inv_wait_dsc.hi.saddr << 2;
>> +
>> + ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
>> + if ( ret )
>> + vvtd_info("Failed to write status address\n");
>> + }
>> +
>> + /*
>> + * The following code generates an invalidation completion event
>> + * indicating the invalidation wait descriptor completion. Note that
>> + * the following code fragment is not tested properly.
>> + */
>> + if ( qinval.q.inv_wait_dsc.lo.iflag )
>> + {
>> + if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
>> + {
>> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
>> + {
>> + vvtd_notify_inv_completion(vvtd);
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + }
>> + }
>> + }
>> + break;
>> +
>> + case TYPE_INVAL_IEC:
>> + /* No cache is preserved in vvtd, nothing is needed to be flushed */
>> + break;
>> +
>> + default:
>> + vvtd_debug("d%d: Invalidation type (%x) isn't supported\n",
>> + vvtd->domain->domain_id, qinval.q.inv_wait_dsc.lo.type);
>> + return -1;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Invalidate all the descriptors in Invalidation Queue.
>> + */
>> +static void vvtd_process_iq(struct vvtd *vvtd)
>> +{
>> + uint32_t max_entry, i, iqh, iqt;
>> + int err = 0;
>> +
>> + /* Trylock avoids more than 1 caller dealing with invalidation requests */
>> + if ( !spin_trylock(&vvtd->ie_lock) )
>
>Uh, is this correct? You are returning without the queue being
>invalidated AFAICT.
QI is an asynchronous interface. Software can queue a special Invalidation Wait
Descriptor to get a notification from hardware once all requests ahead of it
have finished, so returning here without draining the queue is acceptable. The
real invalidation is deferred to a suitable time, i.e. when no interrupt is
being processed, nobody else is doing invalidation and the guest isn't writing
QI-related registers.
Anyway, if we use a coarse-grained lock, 'ie_lock' will be removed and the
invalidation will be done synchronously, because there will be no in-flight
requests.
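To illustrate from the guest's side: completion is observed via the status write of an
Invalidation Wait Descriptor, not via the IQT write itself, so the queue may be drained
later without the guest noticing any difference (rough sketch; queue_*(), write_iqt()
and cpu_relax() are placeholder names):

    volatile uint32_t status = 0;

    queue_iec_invalidation();              /* IEC descriptor(s) queued first */
    queue_inv_wait(&status, STATUS_DONE);  /* wait descriptor with SW=1 */
    write_iqt(new_tail);                   /* advance IQT to publish the requests */

    while ( status != STATUS_DONE )
        cpu_relax();                       /* vvtd may drain the queue later */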
>
>> + return;
>> +
>> + iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG), QINVAL_INDEX_MASK);
>> + iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG), QINVAL_INDEX_MASK);
>> + /*
>> + * No new descriptor is fetched from the Invalidation Queue until
>> + * software clears the IQE field in the Fault Status Register
>> + */
>> + if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
>> + {
>> + spin_unlock(&vvtd->ie_lock);
>> + return;
>> + }
>> +
>> + max_entry = 1 << (QINVAL_ENTRY_ORDER +
>> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> +
>> + ASSERT(iqt < max_entry);
>
>Is it possible for the user to write a valid value to DMAR_IQT_REG and
>then change DMAR_IQA_REG in order to make the above ASSERT trigger?
No. It isn't.
>
>> + if ( iqh == iqt )
>> + {
>> + spin_unlock(&vvtd->ie_lock);
>> + return;
>> + }
>> +
>> + for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
>> + {
>> + err = process_iqe(vvtd, i);
>> + if ( err )
>> + break;
>> + }
>> +
>> + /*
>> + * set IQH before checking error, because IQH should reference
>> + * the descriptor associated with the error when an error is seen
>> + * by guest
>> + */
>> + vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
>> +
>> + spin_unlock(&vvtd->ie_lock);
>> + if ( err )
>> + {
>> + spin_lock(&vvtd->fe_lock);
>> + vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
>> + spin_unlock(&vvtd->fe_lock);
>> + }
>> +}
>> +
>> +static void vvtd_write_iqt(struct vvtd *vvtd, uint32_t val)
>> +{
>> + uint32_t max_entry;
>> +
>> + if ( val & ~QINVAL_INDEX_MASK )
>> + {
>> + vvtd_info("attempts to set reserved bits in IQT\n");
>> + return;
>> + }
>> +
>> + max_entry = 1U << (QINVAL_ENTRY_ORDER +
>> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> + if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
>> + {
>> + vvtd_info("IQT: Value %x exceeded supported max index.", val);
>> + return;
>> + }
>> +
>> + vvtd_set_reg(vvtd, DMAR_IQT_REG, val);
>> +}
>> +
>> +static void vvtd_write_iqa(struct vvtd *vvtd, uint32_t val, bool high)
>> +{
>> + uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
>> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> + uint64_t new;
>> +
>> + if ( high )
>> + new = ((uint64_t)val << 32) | (old & 0xffffffff);
>> + else
>> + new = ((old >> 32) << 32) | val;
>
>You can also use old & ~0xffffffffUL
>
>> +
>> + if ( new & (~((1ULL << cap_mgaw(cap)) - 1) | DMA_IQA_RSVD) )
>> + {
>> + vvtd_info("Attempt to set reserved bits in IQA\n");
>> + return;
>> + }
>> +
>> + vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, new);
>> + if ( high && !vvtd->inv_queue_base )
>> + vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
>> + PFN_DOWN(DMA_IQA_ADDR(new)),
>> + 1 << DMA_IQA_QS(new));
>
>Don't you need to pick a reference to this page(s)?
Mapping guest pages already takes a reference, so there is no need to take
another one here.
>
>> + else if ( !high && vvtd->inv_queue_base )
>
>I'm not sure I follow the logic with high here.
Software can access 64-bit registers as either aligned quadwords or aligned
doublewords. Here the mapping is set up when the upper doubleword is written
and destroyed when the lower doubleword is written.
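In other words, a 64-bit guest write to IQA is emulated as the two calls below (low
doubleword first), so a stale mapping is torn down before the new one is set up
('new_iqa' is just an illustrative value):

    vvtd_write_iqa(vvtd, (uint32_t)new_iqa, 0);         /* low dword: unmap old queue */
    vvtd_write_iqa(vvtd, (uint32_t)(new_iqa >> 32), 1); /* high dword: map new queue */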
>
>> + {
>> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>> + vvtd->inv_queue_base = NULL;
>> + }
>> +}
>> +
>> +static void vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
>> +{
>> + if ( val & DMA_ICS_IWC )
>> + {
>> + vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
>> + /* When IWC field is cleared, the IP field needs to be cleared */
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + }
>> +}
>> +
>> static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>> {
>> /* Writing a 1 means clear fault */
>> @@ -489,6 +682,20 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>> return X86EMUL_OKAY;
>> }
>>
>> +static void vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
>> +{
>> + /* Only DMA_IECTL_IM bit is writable. Generate pending event when unmask */
>> + if ( !(val & DMA_IECTL_IM) )
>> + {
>> + /* Clear IM and clear IP */
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> + if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
>> + vvtd_notify_inv_completion(vvtd);
>> + }
>> + else
>> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> +}
>> +
>> static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
>> {
>> /*
>> @@ -681,6 +888,48 @@ static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
>> spin_unlock(&vvtd->fe_lock);
>> }
>>
>> +static void vvtd_write_invalidation_regs(struct vvtd *vvtd, unsigned long val,
>> + unsigned int offset, unsigned int len)
>> +{
>> + spin_lock(&vvtd->ie_lock);
>> + for ( ; len ; len -= 4, offset += 4, val = val >> 32)
>
>Same comment as in the previous patch, I don't really like the for
>loop, but I guess 64bit access must be allowed to this group of
>registers?
Yes, it is allowed. But the VT-d spec implies that hardware may break a 64-bit
write down into two 32-bit writes.
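As a standalone illustration of that splitting (plain C, nothing Xen-specific; the IQA
offset is only an example):

#include <stdint.h>
#include <stdio.h>

/* Emulate a 64-bit register write as two 32-bit writes, low dword first. */
static void write32(unsigned int offset, uint32_t val)
{
    printf("offset %#x <- %#x\n", offset, (unsigned int)val);
}

static void write_reg(unsigned int offset, uint64_t val, unsigned int len)
{
    for ( ; len; len -= 4, offset += 4, val >>= 32 )
        write32(offset, (uint32_t)val);
}

int main(void)
{
    write_reg(0x90 /* DMAR_IQA_REG */, 0x00000001abcdef00ULL, 8);
    return 0;
}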
>
>> + {
>> + switch ( offset )
>> + {
>> + case DMAR_IECTL_REG:
>> + vvtd_write_iectl(vvtd, val);
>> + break;
>> +
>> + case DMAR_ICS_REG:
>> + vvtd_write_ics(vvtd, val);
>> + break;
>> +
>> + case DMAR_IQT_REG:
>> + vvtd_write_iqt(vvtd, val);
>> + break;
>> +
>> + case DMAR_IQA_REG:
>> + vvtd_write_iqa(vvtd, val, 0);
>> + break;
>> +
>> + case DMAR_IQUA_REG:
>> + vvtd_write_iqa(vvtd, val, 1);
>> + break;
>> +
>> + case DMAR_IEDATA_REG:
>> + case DMAR_IEADDR_REG:
>> + case DMAR_IEUADDR_REG:
>> + vvtd_set_reg(vvtd, offset, val);
>> + break;
>> +
>> + default:
>> + break;
>> + }
>> + }
>> + spin_unlock(&vvtd->ie_lock);
>> +
>> +}
>> +
>> static int vvtd_write(struct vcpu *v, unsigned long addr,
>> unsigned int len, unsigned long val)
>> {
>> @@ -719,6 +968,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
>> vvtd_write_fault_regs(vvtd, val, offset, len);
>> break;
>>
>> + case DMAR_IECTL_REG:
>> + case DMAR_ICS_REG:
>> + case DMAR_IQT_REG:
>> + case DMAR_IQA_REG:
>> + case DMAR_IQUA_REG:
>> + case DMAR_IEDATA_REG:
>> + case DMAR_IEADDR_REG:
>> + case DMAR_IEUADDR_REG:
>> + vvtd_write_invalidation_regs(vvtd, val, offset, len);
>> + break;
>> +
>> default:
>> if ( (offset == (fault_offset + DMA_FRCD2_OFFSET)) ||
>> (offset == (fault_offset + DMA_FRCD3_OFFSET)) )
>> @@ -840,7 +1100,8 @@ static int vvtd_handle_irq_request(const struct domain *d,
>> irte.remap.tm);
>>
>> out:
>> - atomic_dec(&vvtd->inflight_intr);
>> + if ( !atomic_dec_and_test(&vvtd->inflight_intr) )
>> + vvtd_process_iq(vvtd);
>> return ret;
>> }
>>
>> @@ -911,6 +1172,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
>> vvtd->domain = d;
>> register_mmio_handler(d, &vvtd_mmio_ops);
>> spin_lock_init(&vvtd->fe_lock);
>> + spin_lock_init(&vvtd->ie_lock);
>>
>> viommu->priv = vvtd;
>>
>> @@ -930,6 +1192,13 @@ static int vvtd_destroy(struct viommu *viommu)
>> sizeof(struct iremap_entry)));
>> vvtd->irt_base = NULL;
>> }
>> + if ( vvtd->inv_queue_base )
>> + {
>> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +
>> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>
>Don't you also need to unmap this page(s) when QIE is disabled?
Will do.
Thanks
Chao
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel