Re: [Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support
On Mon, Feb 12, 2018 at 02:36:10PM +0000, Roger Pau Monné wrote:
>On Fri, Nov 17, 2017 at 02:22:23PM +0800, Chao Gao wrote:
>> Queued Invalidation Interface is an expanded invalidation interface with
>> extended capabilities. Hardware implementations report support for queued
>> invalidation interface through the Extended Capability Register. The queued
>> invalidation interface uses an Invalidation Queue (IQ), which is a circular
>> buffer in system memory. Software submits commands by writing Invalidation
>> Descriptors to the IQ.
>>
>> In this patch, a new function viommu_process_iq() is used for emulating how
>> hardware handles invalidation requests through QI.
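For readers unfamiliar with QI, here is a minimal standalone sketch (plain C, illustrative names only, not the patch code) of the head/tail bookkeeping the description above refers to: software writes descriptors at the tail and advances IQT, while the emulation fetches from the head until it catches up with the tail.

/* Standalone illustration of Invalidation Queue head/tail handling. */
#include <stdint.h>
#include <stdio.h>

#define QUEUE_ENTRIES 256                /* illustrative queue size */

struct inv_queue {
    uint32_t head;                       /* next descriptor to be fetched */
    uint32_t tail;                       /* next free slot for software */
    uint64_t desc[QUEUE_ENTRIES][2];     /* 128-bit descriptors */
};

/* Software submits a descriptor at the tail, then advances the tail (IQT). */
static void submit(struct inv_queue *q, uint64_t lo, uint64_t hi)
{
    q->desc[q->tail][0] = lo;
    q->desc[q->tail][1] = hi;
    q->tail = (q->tail + 1) % QUEUE_ENTRIES;
}

/* The emulation drains descriptors from the head until head == tail. */
static void drain(struct inv_queue *q)
{
    while ( q->head != q->tail )
    {
        /* process q->desc[q->head][0..1] here */
        q->head = (q->head + 1) % QUEUE_ENTRIES;
    }
}

int main(void)
{
    struct inv_queue q = { 0 };

    submit(&q, 0x5 /* e.g. an inv-wait-like type tag */, 0);
    drain(&q);
    printf("head=%u tail=%u\n", q.head, q.tail);
    return 0;
}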
>
>You should mention that QI is mandatory in order to support interrupt
>remapping.
Will do.
>
>I was about to ask whether QI could be deferred to a later stage, but
>AFAICT this is not an option.
>
>> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
>> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
>>
>> ---
>> v4:
>> - Introduce a lock to protect invalidation related registers.
>> ---
>> xen/drivers/passthrough/vtd/iommu.h | 24 +++-
>> xen/drivers/passthrough/vtd/vvtd.c | 271 +++++++++++++++++++++++++++++++++++-
>> 2 files changed, 293 insertions(+), 2 deletions(-)
>>
>> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
>> index b71dab8..de9188b 100644
>> --- a/xen/drivers/passthrough/vtd/iommu.h
>> +++ b/xen/drivers/passthrough/vtd/iommu.h
>> @@ -47,7 +47,12 @@
>> #define DMAR_IQH_REG 0x80 /* invalidation queue head */
>> #define DMAR_IQT_REG 0x88 /* invalidation queue tail */
>> #define DMAR_IQA_REG 0x90 /* invalidation queue addr */
>> +#define DMAR_IQUA_REG 0x94 /* invalidation queue upper addr */
>> +#define DMAR_ICS_REG 0x9c /* invalidation completion status */
>> #define DMAR_IECTL_REG 0xa0 /* invalidation event control register */
>> +#define DMAR_IEDATA_REG 0xa4 /* invalidation event data register */
>> +#define DMAR_IEADDR_REG 0xa8 /* invalidation event address register */
>> +#define DMAR_IEUADDR_REG 0xac /* upper address register */
>> #define DMAR_IRTA_REG 0xb8 /* base address of intr remap table */
>> #define DMAR_IRTUA_REG 0xbc /* upper address of intr remap table */
>>
>> @@ -175,6 +180,21 @@
>> #define DMA_IRTA_S(val) (val & 0xf)
>> #define DMA_IRTA_SIZE(val) (1UL << (DMA_IRTA_S(val) + 1))
>>
>> +/* IQA_REG */
>> +#define DMA_IQA_ADDR(val) (val & ~0xfffULL)
>> +#define DMA_IQA_QS(val) (val & 0x7)
>> +#define DMA_IQA_RSVD 0xff8ULL
>> +
>> +/* IECTL_REG */
>> +#define DMA_IECTL_IM_SHIFT 31
>> +#define DMA_IECTL_IM (1U << DMA_IECTL_IM_SHIFT)
>> +#define DMA_IECTL_IP_SHIFT 30
>> +#define DMA_IECTL_IP (1U << DMA_IECTL_IP_SHIFT)
>> +
>> +/* ICS_REG */
>> +#define DMA_ICS_IWC_SHIFT 0
>> +#define DMA_ICS_IWC (1U << DMA_ICS_IWC_SHIFT)
>> +
>> /* PMEN_REG */
>> #define DMA_PMEN_EPM (((u32)1) << 31)
>> #define DMA_PMEN_PRS (((u32)1) << 0)
>> @@ -205,13 +225,14 @@
>> /* FSTS_REG */
>> #define DMA_FSTS_PFO_SHIFT 0
>> #define DMA_FSTS_PPF_SHIFT 1
>> +#define DMA_FSTS_IQE_SHIFT 4
>> #define DMA_FSTS_PRO_SHIFT 7
>>
>> #define DMA_FSTS_PFO ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
>> #define DMA_FSTS_PPF ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
>> #define DMA_FSTS_AFO ((uint32_t)1 << 2)
>> #define DMA_FSTS_APF ((uint32_t)1 << 3)
>> -#define DMA_FSTS_IQE ((uint32_t)1 << 4)
>> +#define DMA_FSTS_IQE ((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
>> #define DMA_FSTS_ICE ((uint32_t)1 << 5)
>> #define DMA_FSTS_ITE ((uint32_t)1 << 6)
>> #define DMA_FSTS_PRO ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
>> @@ -555,6 +576,7 @@ struct qinval_entry {
>>
>> /* Queue invalidation head/tail shift */
>> #define QINVAL_INDEX_SHIFT 4
>> +#define QINVAL_INDEX_MASK 0x7fff0ULL
>>
>> #define qinval_present(v) ((v).lo & 1)
>> #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
>> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
>> index a2fa64a..81170ec 100644
>> --- a/xen/drivers/passthrough/vtd/vvtd.c
>> +++ b/xen/drivers/passthrough/vtd/vvtd.c
>> @@ -27,6 +27,7 @@
>> #include <asm/event.h>
>> #include <asm/io_apic.h>
>> #include <asm/hvm/domain.h>
>> +#include <asm/hvm/support.h>
>> #include <asm/p2m.h>
>>
>> #include "iommu.h"
>> @@ -68,6 +69,9 @@ struct vvtd {
>>
>> struct hvm_hw_vvtd hw;
>> void *irt_base;
>> + void *inv_queue_base;
>
>Why not declare this as:
>
>struct qinval_entry *
will do.
>
>> + /* This lock protects invalidation related registers */
>> + spinlock_t ie_lock;
>
>As noted in another patch, I think the first approach should be to use
>a single lock that serializes access to the whole vIOMMU register
>space. Later we can see about more fine grained locking.
It seems you mean a coarse-grained lock should be taken at the beginning of any
operation that reads or writes vIOMMU registers. Adding such a lock should be
easy.
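For illustration, the coarse-grained variant could simply wrap the existing register
dispatch, something like the fragment below (the 'reg_lock' field name is made up here):

    /* hypothetical: one lock serializing the whole vIOMMU register space */
    spin_lock(&vvtd->reg_lock);
    switch ( offset )
    {
        /* ... all fault and invalidation register cases ... */
    }
    spin_unlock(&vvtd->reg_lock);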
>
>> };
>>
>> /* Setting viommu_verbose enables debugging messages of vIOMMU */
>> @@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
>> vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
>> }
>>
>> +static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
>> +{
>> + vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
>> + vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
>> +}
>> +
>> /* Computing the IRTE index for a given interrupt request. When success, return
>> * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
>> * i.e. -1 when the irq request isn't an remapping format.
>> @@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
>> return X86EMUL_OKAY;
>> }
>>
>> +/*
>> + * Process an invalidation descriptor. Currently, only two types descriptors,
>> + * Interrupt Entry Cache Invalidation Descriptor and Invalidation Wait
>> + * Descriptor are handled.
>> + * @vvtd: the virtual vtd instance
>> + * @i: the index of the invalidation descriptor to be processed
>> + *
>> + * If success return 0, or return non-zero when failure.
>> + */
>> +static int process_iqe(struct vvtd *vvtd, uint32_t i)
>> +{
>> + struct qinval_entry qinval;
>> + int ret = 0;
>> +
>> + if ( !vvtd->inv_queue_base )
>> + {
>> + gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
>> + return -1;
>
>If you just return -1 or 0 please use bool instead. Or return proper
>error codes.
Will return meaningful error codes.
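For example (just a sketch of the agreed direction; the exact error values are a guess):

    if ( !vvtd->inv_queue_base )
        return -ENOENT;     /* queue base not programmed */
    ...
    default:
        return -EINVAL;     /* unsupported descriptor type */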
>
>> + }
>> + qinval = ((struct qinval_entry *)vvtd->inv_queue_base)[i];
>
>See my comment above regarding how inv_queue_base is declared, I'm not
>sure why the copy is needed here.
No copy is needed here. Will fix.
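I.e. roughly (sketch only), combining this with the declaration change suggested above:

    struct qinval_entry *inv_queue_base;    /* declaration in struct vvtd */
    ...
    const struct qinval_entry *qinval = &vvtd->inv_queue_base[i];

    switch ( qinval->q.inv_wait_dsc.lo.type )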
>
>> +
>> + switch ( qinval.q.inv_wait_dsc.lo.type )
>> + {
>> + case TYPE_INVAL_WAIT:
>> + if ( qinval.q.inv_wait_dsc.lo.sw )
>> + {
>> + uint32_t data = qinval.q.inv_wait_dsc.lo.sdata;
>> + uint64_t addr = qinval.q.inv_wait_dsc.hi.saddr << 2;
>> +
>> + ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
>> + if ( ret )
>> + vvtd_info("Failed to write status address\n");
>> + }
>> +
>> + /*
>> + * The following code generates an invalidation completion event
>> + * indicating the invalidation wait descriptor completion. Note that
>> + * the following code fragment is not tested properly.
>> + */
>> + if ( qinval.q.inv_wait_dsc.lo.iflag )
>> + {
>> + if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
>> + {
>> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
>> + {
>> + vvtd_notify_inv_completion(vvtd);
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + }
>> + }
>> + }
>> + break;
>> +
>> + case TYPE_INVAL_IEC:
>> + /* No cache is preserved in vvtd, nothing is needed to be flushed */
>> + break;
>> +
>> + default:
>> + vvtd_debug("d%d: Invalidation type (%x) isn't supported\n",
>> + vvtd->domain->domain_id, qinval.q.inv_wait_dsc.lo.type);
>> + return -1;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Invalidate all the descriptors in Invalidation Queue.
>> + */
>> +static void vvtd_process_iq(struct vvtd *vvtd)
>> +{
>> + uint32_t max_entry, i, iqh, iqt;
>> + int err = 0;
>> +
>> + /* Trylock avoids more than 1 caller dealing with invalidation requests */
>> + if ( !spin_trylock(&vvtd->ie_lock) )
>
>Uh, is this correct? You are returning without the queue being
>invalidated AFAICT.
QI is an asynchronous interface. Software can queue a special Invalidation Wait
Descriptor to get a notification from hardware once all requests ahead of it
have finished, so returning here without draining the queue is acceptable. The
real invalidation is deferred to a suitable time, i.e. when no interrupt is
being processed, nobody else is doing invalidation and the guest isn't writing
QI-related registers.
Anyway, if we use a coarse-grained lock, 'ie_lock' will be removed and the
invalidation will be done synchronously, because there will be no in-flight
requests.
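To illustrate from the guest's side: completion is observed via the status write of an
Invalidation Wait Descriptor, not via the IQT write itself, so the queue may be drained
later without the guest noticing any difference (rough sketch; queue_*(), write_iqt()
and cpu_relax() are placeholder names):

    volatile uint32_t status = 0;

    queue_iec_invalidation();              /* IEC descriptor(s) queued first */
    queue_inv_wait(&status, STATUS_DONE);  /* wait descriptor with SW=1 */
    write_iqt(new_tail);                   /* advance IQT to publish the requests */

    while ( status != STATUS_DONE )
        cpu_relax();                       /* vvtd may drain the queue later */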
>
>> + return;
>> +
>> + iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG), QINVAL_INDEX_MASK);
>> + iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG), QINVAL_INDEX_MASK);
>> + /*
>> + * No new descriptor is fetched from the Invalidation Queue until
>> + * software clears the IQE field in the Fault Status Register
>> + */
>> + if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
>> + {
>> + spin_unlock(&vvtd->ie_lock);
>> + return;
>> + }
>> +
>> + max_entry = 1 << (QINVAL_ENTRY_ORDER +
>> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> +
>> + ASSERT(iqt < max_entry);
>
>Is it possible for the user to write a valid value to DMAR_IQT_REG and
>then change DMAR_IQA_REG in order to make the above ASSERT trigger?
No. It isn't.
>
>> + if ( iqh == iqt )
>> + {
>> + spin_unlock(&vvtd->ie_lock);
>> + return;
>> + }
>> +
>> + for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
>> + {
>> + err = process_iqe(vvtd, i);
>> + if ( err )
>> + break;
>> + }
>> +
>> + /*
>> + * set IQH before checking error, because IQH should reference
>> + * the descriptor associated with the error when an error is seen
>> + * by guest
>> + */
>> + vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
>> +
>> + spin_unlock(&vvtd->ie_lock);
>> + if ( err )
>> + {
>> + spin_lock(&vvtd->fe_lock);
>> + vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
>> + spin_unlock(&vvtd->fe_lock);
>> + }
>> +}
>> +
>> +static void vvtd_write_iqt(struct vvtd *vvtd, uint32_t val)
>> +{
>> + uint32_t max_entry;
>> +
>> + if ( val & ~QINVAL_INDEX_MASK )
>> + {
>> + vvtd_info("attempts to set reserved bits in IQT\n");
>> + return;
>> + }
>> +
>> + max_entry = 1U << (QINVAL_ENTRY_ORDER +
>> + DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> + if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
>> + {
>> + vvtd_info("IQT: Value %x exceeded supported max index.", val);
>> + return;
>> + }
>> +
>> + vvtd_set_reg(vvtd, DMAR_IQT_REG, val);
>> +}
>> +
>> +static void vvtd_write_iqa(struct vvtd *vvtd, uint32_t val, bool high)
>> +{
>> + uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
>> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> + uint64_t new;
>> +
>> + if ( high )
>> + new = ((uint64_t)val << 32) | (old & 0xffffffff);
>> + else
>> + new = ((old >> 32) << 32) | val;
>
>You can also use old & ~0xffffffffUL
>
>> +
>> + if ( new & (~((1ULL << cap_mgaw(cap)) - 1) | DMA_IQA_RSVD) )
>> + {
>> + vvtd_info("Attempt to set reserved bits in IQA\n");
>> + return;
>> + }
>> +
>> + vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, new);
>> + if ( high && !vvtd->inv_queue_base )
>> + vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
>> + PFN_DOWN(DMA_IQA_ADDR(new)),
>> + 1 << DMA_IQA_QS(new));
>
>Don't you need to pick a reference to this page(s)?
Mapping guest pages already takes a reference, so there is no need to take
another one here.
>
>> + else if ( !high && vvtd->inv_queue_base )
>
>I'm not sure I follow the logic with high here.
Software can access 64-bit registers as either aligned quadwords or aligned
doublewords. Here the mapping is set up when the upper doubleword is written
and destroyed when the lower doubleword is written.
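In other words, a 64-bit guest write to IQA is emulated as the two calls below (low
doubleword first), so a stale mapping is torn down before the new one is set up
('new_iqa' is just an illustrative value):

    vvtd_write_iqa(vvtd, (uint32_t)new_iqa, 0);         /* low dword: unmap old queue */
    vvtd_write_iqa(vvtd, (uint32_t)(new_iqa >> 32), 1); /* high dword: map new queue */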
>
>> + {
>> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>> + vvtd->inv_queue_base = NULL;
>> + }
>> +}
>> +
>> +static void vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
>> +{
>> + if ( val & DMA_ICS_IWC )
>> + {
>> + vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
>> + /* When IWC field is cleared, the IP field needs to be cleared */
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> + }
>> +}
>> +
>> static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>> {
>> /* Writing a 1 means clear fault */
>> @@ -489,6 +682,20 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>> return X86EMUL_OKAY;
>> }
>>
>> +static void vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
>> +{
>> + /* Only DMA_IECTL_IM bit is writable. Generate pending event when unmask */
>> + if ( !(val & DMA_IECTL_IM) )
>> + {
>> + /* Clear IM and clear IP */
>> + vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> + if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
>> + vvtd_notify_inv_completion(vvtd);
>> + }
>> + else
>> + vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> +}
>> +
>> static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
>> {
>> /*
>> @@ -681,6 +888,48 @@ static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
>> spin_unlock(&vvtd->fe_lock);
>> }
>>
>> +static void vvtd_write_invalidation_regs(struct vvtd *vvtd, unsigned long val,
>> + unsigned int offset, unsigned int len)
>> +{
>> + spin_lock(&vvtd->ie_lock);
>> + for ( ; len ; len -= 4, offset += 4, val = val >> 32)
>
>Same comment as in the previous patch, I don't really like the for
>loop, but I guess 64bit access must be allowed to this group of
>registers?
Yes, it is allowed. But the VT-d spec implies that hardware may break a 64-bit
write down into two 32-bit writes.
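As a standalone illustration of that splitting (plain C, nothing Xen-specific; the IQA
offset is only an example):

#include <stdint.h>
#include <stdio.h>

/* Emulate a 64-bit register write as two 32-bit writes, low dword first. */
static void write32(unsigned int offset, uint32_t val)
{
    printf("offset %#x <- %#x\n", offset, (unsigned int)val);
}

static void write_reg(unsigned int offset, uint64_t val, unsigned int len)
{
    for ( ; len; len -= 4, offset += 4, val >>= 32 )
        write32(offset, (uint32_t)val);
}

int main(void)
{
    write_reg(0x90 /* DMAR_IQA_REG */, 0x00000001abcdef00ULL, 8);
    return 0;
}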
>
>> + {
>> + switch ( offset )
>> + {
>> + case DMAR_IECTL_REG:
>> + vvtd_write_iectl(vvtd, val);
>> + break;
>> +
>> + case DMAR_ICS_REG:
>> + vvtd_write_ics(vvtd, val);
>> + break;
>> +
>> + case DMAR_IQT_REG:
>> + vvtd_write_iqt(vvtd, val);
>> + break;
>> +
>> + case DMAR_IQA_REG:
>> + vvtd_write_iqa(vvtd, val, 0);
>> + break;
>> +
>> + case DMAR_IQUA_REG:
>> + vvtd_write_iqa(vvtd, val, 1);
>> + break;
>> +
>> + case DMAR_IEDATA_REG:
>> + case DMAR_IEADDR_REG:
>> + case DMAR_IEUADDR_REG:
>> + vvtd_set_reg(vvtd, offset, val);
>> + break;
>> +
>> + default:
>> + break;
>> + }
>> + }
>> + spin_unlock(&vvtd->ie_lock);
>> +
>> +}
>> +
>> static int vvtd_write(struct vcpu *v, unsigned long addr,
>> unsigned int len, unsigned long val)
>> {
>> @@ -719,6 +968,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
>> vvtd_write_fault_regs(vvtd, val, offset, len);
>> break;
>>
>> + case DMAR_IECTL_REG:
>> + case DMAR_ICS_REG:
>> + case DMAR_IQT_REG:
>> + case DMAR_IQA_REG:
>> + case DMAR_IQUA_REG:
>> + case DMAR_IEDATA_REG:
>> + case DMAR_IEADDR_REG:
>> + case DMAR_IEUADDR_REG:
>> + vvtd_write_invalidation_regs(vvtd, val, offset, len);
>> + break;
>> +
>> default:
>> if ( (offset == (fault_offset + DMA_FRCD2_OFFSET)) ||
>> (offset == (fault_offset + DMA_FRCD3_OFFSET)) )
>> @@ -840,7 +1100,8 @@ static int vvtd_handle_irq_request(const struct domain *d,
>> irte.remap.tm);
>>
>> out:
>> - atomic_dec(&vvtd->inflight_intr);
>> + if ( !atomic_dec_and_test(&vvtd->inflight_intr) )
>> + vvtd_process_iq(vvtd);
>> return ret;
>> }
>>
>> @@ -911,6 +1172,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
>> vvtd->domain = d;
>> register_mmio_handler(d, &vvtd_mmio_ops);
>> spin_lock_init(&vvtd->fe_lock);
>> + spin_lock_init(&vvtd->ie_lock);
>>
>> viommu->priv = vvtd;
>>
>> @@ -930,6 +1192,13 @@ static int vvtd_destroy(struct viommu *viommu)
>> sizeof(struct iremap_entry)));
>> vvtd->irt_base = NULL;
>> }
>> + if ( vvtd->inv_queue_base )
>> + {
>> + uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +
>> + unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>
>Don't you also need to unmap this page(s) when QIE is disabled?
Will do.
Thanks
Chao
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel