
Re: [Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support



On Mon, Feb 12, 2018 at 02:36:10PM +0000, Roger Pau Monné wrote:
>On Fri, Nov 17, 2017 at 02:22:23PM +0800, Chao Gao wrote:
>> Queued Invalidation Interface is an expanded invalidation interface with
>> extended capabilities. Hardware implementations report support for queued
>> invalidation interface through the Extended Capability Register. The queued
>> invalidation interface uses an Invalidation Queue (IQ), which is a circular
>> buffer in system memory. Software submits commands by writing Invalidation
>> Descriptors to the IQ.
>> 
>> In this patch, a new function viommu_process_iq() is used for emulating how
>> hardware handles invalidation requests through QI.
>
>You should mention that QI is mandatory in order to support interrupt
>remapping.

Will do.

>
>I was about to ask whether QI could be deferred to a later stage, but
>AFAICT this is not an option.
>
>> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
>> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
>> 
>> ---
>> v4:
>>  - Introduce a lock to protect invalidation related registers.
>> ---
>>  xen/drivers/passthrough/vtd/iommu.h |  24 +++-
>>  xen/drivers/passthrough/vtd/vvtd.c  | 271 +++++++++++++++++++++++++++++++++++-
>>  2 files changed, 293 insertions(+), 2 deletions(-)
>> 
>> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
>> index b71dab8..de9188b 100644
>> --- a/xen/drivers/passthrough/vtd/iommu.h
>> +++ b/xen/drivers/passthrough/vtd/iommu.h
>> @@ -47,7 +47,12 @@
>>  #define DMAR_IQH_REG            0x80 /* invalidation queue head */
>>  #define DMAR_IQT_REG            0x88 /* invalidation queue tail */
>>  #define DMAR_IQA_REG            0x90 /* invalidation queue addr */
>> +#define DMAR_IQUA_REG           0x94 /* invalidation queue upper addr */
>> +#define DMAR_ICS_REG            0x9c /* invalidation completion status */
>>  #define DMAR_IECTL_REG          0xa0 /* invalidation event control register */
>> +#define DMAR_IEDATA_REG         0xa4 /* invalidation event data register */
>> +#define DMAR_IEADDR_REG         0xa8 /* invalidation event address register */
>> +#define DMAR_IEUADDR_REG        0xac /* upper address register */
>>  #define DMAR_IRTA_REG           0xb8 /* base address of intr remap table */
>>  #define DMAR_IRTUA_REG          0xbc /* upper address of intr remap table */
>>  
>> @@ -175,6 +180,21 @@
>>  #define DMA_IRTA_S(val)         (val & 0xf)
>>  #define DMA_IRTA_SIZE(val)      (1UL << (DMA_IRTA_S(val) + 1))
>>  
>> +/* IQA_REG */
>> +#define DMA_IQA_ADDR(val)       (val & ~0xfffULL)
>> +#define DMA_IQA_QS(val)         (val & 0x7)
>> +#define DMA_IQA_RSVD            0xff8ULL
>> +
>> +/* IECTL_REG */
>> +#define DMA_IECTL_IM_SHIFT 31
>> +#define DMA_IECTL_IM            (1U << DMA_IECTL_IM_SHIFT)
>> +#define DMA_IECTL_IP_SHIFT 30
>> +#define DMA_IECTL_IP            (1U << DMA_IECTL_IP_SHIFT)
>> +
>> +/* ICS_REG */
>> +#define DMA_ICS_IWC_SHIFT       0
>> +#define DMA_ICS_IWC             (1U << DMA_ICS_IWC_SHIFT)
>> +
>>  /* PMEN_REG */
>>  #define DMA_PMEN_EPM    (((u32)1) << 31)
>>  #define DMA_PMEN_PRS    (((u32)1) << 0)
>> @@ -205,13 +225,14 @@
>>  /* FSTS_REG */
>>  #define DMA_FSTS_PFO_SHIFT  0
>>  #define DMA_FSTS_PPF_SHIFT  1
>> +#define DMA_FSTS_IQE_SHIFT  4
>>  #define DMA_FSTS_PRO_SHIFT  7
>>  
>>  #define DMA_FSTS_PFO        ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
>>  #define DMA_FSTS_PPF        ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
>>  #define DMA_FSTS_AFO        ((uint32_t)1 << 2)
>>  #define DMA_FSTS_APF        ((uint32_t)1 << 3)
>> -#define DMA_FSTS_IQE        ((uint32_t)1 << 4)
>> +#define DMA_FSTS_IQE        ((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
>>  #define DMA_FSTS_ICE        ((uint32_t)1 << 5)
>>  #define DMA_FSTS_ITE        ((uint32_t)1 << 6)
>>  #define DMA_FSTS_PRO        ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
>> @@ -555,6 +576,7 @@ struct qinval_entry {
>>  
>>  /* Queue invalidation head/tail shift */
>>  #define QINVAL_INDEX_SHIFT 4
>> +#define QINVAL_INDEX_MASK  0x7fff0ULL
>>  
>>  #define qinval_present(v) ((v).lo & 1)
>>  #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
>> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
>> index a2fa64a..81170ec 100644
>> --- a/xen/drivers/passthrough/vtd/vvtd.c
>> +++ b/xen/drivers/passthrough/vtd/vvtd.c
>> @@ -27,6 +27,7 @@
>>  #include <asm/event.h>
>>  #include <asm/io_apic.h>
>>  #include <asm/hvm/domain.h>
>> +#include <asm/hvm/support.h>
>>  #include <asm/p2m.h>
>>  
>>  #include "iommu.h"
>> @@ -68,6 +69,9 @@ struct vvtd {
>>  
>>      struct hvm_hw_vvtd hw;
>>      void *irt_base;
>> +    void *inv_queue_base;
>
>Why not declare this as:
>
>struct qinval_entry *

Will do.

>
>> +    /* This lock protects invalidation related registers */
>> +    spinlock_t ie_lock;
>
>As noted in another patch, I think the first approach should be to use
>a single lock that serializes access to the whole vIOMMU register
>space. Later we can see about more fine grained locking.

It seems you mean a coarse-grained lock should be taken at the beginning of
operations which read and write vIOMMU registers. It should be easy to add
such a lock.
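
Roughly something like the sketch below (untested; domain_vvtd(), 'vvtd->lock'
and vvtd_write_locked() are placeholder names for the existing lookup helper,
the single lock and today's switch body with fe_lock/ie_lock dropped):

static int vvtd_write(struct vcpu *v, unsigned long addr,
                      unsigned int len, unsigned long val)
{
    struct vvtd *vvtd = domain_vvtd(v->domain);
    int ret;

    /* One lock serializes access to the whole vIOMMU register space. */
    spin_lock(&vvtd->lock);
    ret = vvtd_write_locked(vvtd, addr, len, val);
    spin_unlock(&vvtd->lock);

    return ret;
}

and similarly for the read side.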

>
>>  };
>>  
>>  /* Setting viommu_verbose enables debugging messages of vIOMMU */
>> @@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
>>                              vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
>>  }
>>  
>> +static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
>> +{
>> +    vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
>> +                            vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
>> +}
>> +
>>  /* Computing the IRTE index for a given interrupt request. When success, return
>>   * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
>>   * i.e. -1 when the irq request isn't an remapping format.
>> @@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
>>      return X86EMUL_OKAY;
>>  }
>>  
>> +/*
>> + * Process an invalidation descriptor. Currently, only two types descriptors,
>> + * Interrupt Entry Cache Invalidation Descritor and Invalidation Wait
>> + * Descriptor are handled.
>> + * @vvtd: the virtual vtd instance
>> + * @i: the index of the invalidation descriptor to be processed
>> + *
>> + * If success return 0, or return non-zero when failure.
>> + */
>> +static int process_iqe(struct vvtd *vvtd, uint32_t i)
>> +{
>> +    struct qinval_entry qinval;
>> +    int ret = 0;
>> +
>> +    if ( !vvtd->inv_queue_base )
>> +    {
>> +        gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
>> +        return -1;
>
>If you just return -1 or 0 please use bool instead. Or return proper
>error codes.

Will return meaningful error codes.

>
>> +    }
>> +    qinval = ((struct qinval_entry *)vvtd->inv_queue_base)[i];
>
>See my comment above regarding how inv_queue_base is declared, I'm not
>sure why the copy is needed here.

No copy is needed here. Will fix.

>
>> +
>> +    switch ( qinval.q.inv_wait_dsc.lo.type )
>> +    {
>> +    case TYPE_INVAL_WAIT:
>> +        if ( qinval.q.inv_wait_dsc.lo.sw )
>> +        {
>> +            uint32_t data = qinval.q.inv_wait_dsc.lo.sdata;
>> +            uint64_t addr = qinval.q.inv_wait_dsc.hi.saddr << 2;
>> +
>> +            ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
>> +            if ( ret )
>> +                vvtd_info("Failed to write status address\n");
>> +        }
>> +
>> +        /*
>> +         * The following code generates an invalidation completion event
>> +         * indicating the invalidation wait descriptor completion. Note that
>> +         * the following code fragment is not tested properly.
>> +         */
>> +        if ( qinval.q.inv_wait_dsc.lo.iflag )
>> +        {
>> +            if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
>> +            {
>> +                vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> +                if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
>> +                {
>> +                    vvtd_notify_inv_completion(vvtd);
>> +                    vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> +                }
>> +            }
>> +        }
>> +        break;
>> +
>> +    case TYPE_INVAL_IEC:
>> +        /* No cache is preserved in vvtd, nothing is needed to be flushed */
>> +        break;
>> +
>> +    default:
>> +        vvtd_debug("d%d: Invalidation type (%x) isn't supported\n",
>> +                   vvtd->domain->domain_id, qinval.q.inv_wait_dsc.lo.type);
>> +        return -1;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +/*
>> + * Invalidate all the descriptors in Invalidation Queue.
>> + */
>> +static void vvtd_process_iq(struct vvtd *vvtd)
>> +{
>> +    uint32_t max_entry, i, iqh, iqt;
>> +    int err = 0;
>> +
>> +    /* Trylock avoids more than 1 caller dealing with invalidation requests */
>> +    if ( !spin_trylock(&vvtd->ie_lock) )
>
>Uh, is this correct? You are returning without the queue being
>invalidated AFAICT.

QI is an asynchronous operation. Software can queue a special invalidation
request (an Invalidation Wait Descriptor) to the invalidation queue to get a
notification from hardware once all requests before that special request have
completed. So returning without draining the queue is acceptable: the real
invalidation is deferred to a suitable time, i.e. when no interrupt is being
processed, no one else is doing invalidation, and the guest isn't writing the
QI-related registers.

Anyway, if we use a coarse-grained lock, 'ie_lock' will be removed and
invalidation will be done synchronously, because there will be no in-flight
requests.
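
For reference, the guest-side synchronisation that makes this safe looks
roughly like the sketch below (illustrative only: the descriptor fields are
the ones used by process_iqe() above, while status_maddr and submit_to_iq()
-- the guest writing the descriptor and advancing IQT -- are placeholders):

uint32_t status = 0;
struct qinval_entry wait = {};

wait.q.inv_wait_dsc.lo.type  = TYPE_INVAL_WAIT;
wait.q.inv_wait_dsc.lo.sw    = 1;                  /* request a status write */
wait.q.inv_wait_dsc.lo.sdata = 1;                  /* value to be stored */
wait.q.inv_wait_dsc.hi.saddr = status_maddr >> 2;  /* 4-byte aligned address */

submit_to_iq(&wait);
while ( ACCESS_ONCE(status) != 1 )
    cpu_relax();    /* completion may be signalled arbitrarily later */

The guest only relies on the status write (or on the completion interrupt when
IF is set), not on the queue having been drained at the moment IQT is written.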

>
>> +        return;
>> +
>> +    iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG), QINVAL_INDEX_MASK);
>> +    iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG), QINVAL_INDEX_MASK);
>> +    /*
>> +     * No new descriptor is fetched from the Invalidation Queue until
>> +     * software clears the IQE field in the Fault Status Register
>> +     */
>> +    if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
>> +    {
>> +        spin_unlock(&vvtd->ie_lock);
>> +        return;
>> +    }
>> +
>> +    max_entry = 1 << (QINVAL_ENTRY_ORDER +
>> +                      DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> +
>> +    ASSERT(iqt < max_entry);
>
>Is it possible for the user to write a valid value to DMAR_IQT_REG and
>then change DMAR_IQA_REG in order to make the above ASSERT trigger?

No. It isn't.

>
>> +    if ( iqh == iqt )
>> +    {
>> +        spin_unlock(&vvtd->ie_lock);
>> +        return;
>> +    }
>> +
>> +    for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
>> +    {
>> +        err = process_iqe(vvtd, i);
>> +        if ( err )
>> +            break;
>> +    }
>> +
>> +    /*
>> +     * set IQH before checking error, because IQH should reference
>> +     * the desriptor associated with the error when an error is seen
>> +     * by guest
>> +     */
>> +    vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
>> +
>> +    spin_unlock(&vvtd->ie_lock);
>> +    if ( err )
>> +    {
>> +        spin_lock(&vvtd->fe_lock);
>> +        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
>> +        spin_unlock(&vvtd->fe_lock);
>> +    }
>> +}
>> +
>> +static void vvtd_write_iqt(struct vvtd *vvtd, uint32_t val)
>> +{
>> +    uint32_t max_entry;
>> +
>> +    if ( val & ~QINVAL_INDEX_MASK )
>> +    {
>> +        vvtd_info("attempts to set reserved bits in IQT\n");
>> +        return;
>> +    }
>> +
>> +    max_entry = 1U << (QINVAL_ENTRY_ORDER +
>> +                       DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
>> +    if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
>> +    {
>> +        vvtd_info("IQT: Value %x exceeded supported max index.", val);
>> +        return;
>> +    }
>> +
>> +    vvtd_set_reg(vvtd, DMAR_IQT_REG, val);
>> +}
>> +
>> +static void vvtd_write_iqa(struct vvtd *vvtd, uint32_t val, bool high)
>> +{
>> +    uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
>> +    uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +    uint64_t new;
>> +
>> +    if ( high )
>> +        new = ((uint64_t)val << 32) | (old & 0xffffffff);
>> +    else
>> +        new = ((old >> 32) << 32) | val;
>
>You can also use old & ~0xffffffffUL
>
>> +
>> +    if ( new & (~((1ULL << cap_mgaw(cap)) - 1) | DMA_IQA_RSVD) )
>> +    {
>> +        vvtd_info("Attempt to set reserved bits in IQA\n");
>> +        return;
>> +    }
>> +
>> +    vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, new);
>> +    if ( high && !vvtd->inv_queue_base )
>> +        vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
>> +                                               PFN_DOWN(DMA_IQA_ADDR(new)),
>> +                                               1 << DMA_IQA_QS(new));
>
>Don't you need to pick a reference to this page(s)?

Mapping the guest pages already takes a reference; there is no need to take
another one.

>
>> +    else if ( !high && vvtd->inv_queue_base )
>
>I'm not sure I follow the logic with high here.

Software can access 64-bit registers as either aligned quadwords or aligned
doublewords. Here we set up the mapping when the upper doubleword is written
and destroy it when the lower doubleword is written.
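
To illustrate the doubleword case (accessor and variable names below are only
for the example), the programming sequence the code is written for is:

/* Low dword first: any previously established IQ mapping is torn down. */
writel((uint32_t)iq_maddr | qs, mmio_base + DMAR_IQA_REG);

/* High dword second: the 64-bit address is now complete, so the mapping for
 * the newly programmed queue is set up at this point. */
writel(iq_maddr >> 32, mmio_base + DMAR_IQUA_REG);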

>
>> +    {
>> +        unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>> +        vvtd->inv_queue_base = NULL;
>> +    }
>> +}
>> +
>> +static void vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
>> +{
>> +    if ( val & DMA_ICS_IWC )
>> +    {
>> +        vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
>> +        /* When IWC field is cleared, the IP field needs to be cleared */
>> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
>> +    }
>> +}
>> +
>>  static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>>  {
>>      /* Writing a 1 means clear fault */
>> @@ -489,6 +682,20 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>>      return X86EMUL_OKAY;
>>  }
>>  
>> +static void vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
>> +{
>> +    /* Only DMA_IECTL_IM bit is writable. Generate pending event when unmask */
>> +    if ( !(val & DMA_IECTL_IM) )
>> +    {
>> +        /* Clear IM and clear IP */
>> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> +        if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
>> +            vvtd_notify_inv_completion(vvtd);
>> +    }
>> +    else
>> +        vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
>> +}
>> +
>>  static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
>>  {
>>      /*
>> @@ -681,6 +888,48 @@ static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
>>      spin_unlock(&vvtd->fe_lock);
>>  }
>>  
>> +static void vvtd_write_invalidation_regs(struct vvtd *vvtd, unsigned long val,
>> +                                         unsigned int offset, unsigned int len)
>> +{
>> +    spin_lock(&vvtd->ie_lock);
>> +    for ( ; len ; len -= 4, offset += 4, val = val >> 32)
>
>Same comment as in the previous patch, I don't really like the for
>loop, but I guess 64bit access must be allowed to these grup of
>registers?

Yes, it is allowed. But the VT-d spec implies that hardware may split a 64-bit
write into two 32-bit writes.
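
For example (values picked just for illustration), a 64-bit write of
0x00000001_00003000 to DMAR_IQA_REG reaches vvtd_write_invalidation_regs()
with len == 8 and the loop above turns it into:

/* 1st pass: offset == DMAR_IQA_REG (0x90), low dword */
vvtd_write_iqa(vvtd, 0x00003000, 0);
/* len -= 4, offset += 4, val >>= 32 */
/* 2nd pass: offset == DMAR_IQUA_REG (0x94), high dword */
vvtd_write_iqa(vvtd, 0x00000001, 1);

which is also why the IQ mapping above is only (re)established on the
upper-doubleword write.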

>
>> +    {
>> +        switch ( offset )
>> +        {
>> +        case DMAR_IECTL_REG:
>> +            vvtd_write_iectl(vvtd, val);
>> +            break;
>> +
>> +        case DMAR_ICS_REG:
>> +            vvtd_write_ics(vvtd, val);
>> +            break;
>> +
>> +        case DMAR_IQT_REG:
>> +            vvtd_write_iqt(vvtd, val);
>> +            break;
>> +
>> +        case DMAR_IQA_REG:
>> +            vvtd_write_iqa(vvtd, val, 0);
>> +            break;
>> +
>> +        case DMAR_IQUA_REG:
>> +            vvtd_write_iqa(vvtd, val, 1);
>> +            break;
>> +
>> +        case DMAR_IEDATA_REG:
>> +        case DMAR_IEADDR_REG:
>> +        case DMAR_IEUADDR_REG:
>> +            vvtd_set_reg(vvtd, offset, val);
>> +            break;
>> +
>> +        default:
>> +            break;
>> +        }
>> +    }
>> +    spin_unlock(&vvtd->ie_lock);
>> +
>> +}
>> +
>>  static int vvtd_write(struct vcpu *v, unsigned long addr,
>>                        unsigned int len, unsigned long val)
>>  {
>> @@ -719,6 +968,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
>>          vvtd_write_fault_regs(vvtd, val, offset, len);
>>          break;
>>  
>> +    case DMAR_IECTL_REG:
>> +    case DMAR_ICS_REG:
>> +    case DMAR_IQT_REG:
>> +    case DMAR_IQA_REG:
>> +    case DMAR_IQUA_REG:
>> +    case DMAR_IEDATA_REG:
>> +    case DMAR_IEADDR_REG:
>> +    case DMAR_IEUADDR_REG:
>> +        vvtd_write_invalidation_regs(vvtd, val, offset, len);
>> +        break;
>> +
>>      default:
>>          if ( (offset == (fault_offset + DMA_FRCD2_OFFSET)) ||
>>               (offset == (fault_offset + DMA_FRCD3_OFFSET)) )
>> @@ -840,7 +1100,8 @@ static int vvtd_handle_irq_request(const struct domain *d,
>>                          irte.remap.tm);
>>  
>>   out:
>> -    atomic_dec(&vvtd->inflight_intr);
>> +    if ( !atomic_dec_and_test(&vvtd->inflight_intr) )
>> +        vvtd_process_iq(vvtd);
>>      return ret;
>>  }
>>  
>> @@ -911,6 +1172,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
>>      vvtd->domain = d;
>>      register_mmio_handler(d, &vvtd_mmio_ops);
>>      spin_lock_init(&vvtd->fe_lock);
>> +    spin_lock_init(&vvtd->ie_lock);
>>  
>>      viommu->priv = vvtd;
>>  
>> @@ -930,6 +1192,13 @@ static int vvtd_destroy(struct viommu *viommu)
>>                                       sizeof(struct iremap_entry)));
>>              vvtd->irt_base = NULL;
>>          }
>> +        if ( vvtd->inv_queue_base )
>> +        {
>> +            uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
>> +
>> +            unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
>
>Don't you also need to unmap this page(s) when QIE is disabled?

Will do.

Thanks
Chao

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

