
Re: [Xen-devel] [PATCH v4 16/28] x86/vvtd: Add queued invalidation (QI) support



On Fri, Nov 17, 2017 at 02:22:23PM +0800, Chao Gao wrote:
> Queued Invalidation Interface is an expanded invalidation interface with
> extended capabilities. Hardware implementations report support for queued
> invalidation interface through the Extended Capability Register. The queued
> invalidation interface uses an Invalidation Queue (IQ), which is a circular
> buffer in system memory. Software submits commands by writing Invalidation
> Descriptors to the IQ.
> 
> In this patch, a new function viommu_process_iq() is used for emulating how
> hardware handles invalidation requests through QI.

You should mention that QI is mandatory in order to support interrupt
remapping.

I was about to ask whether QI could be deferred to a later stage, but
AFAICT this is not an option.
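
FWIW, the guest-side flow that ends up exercising this is roughly the
following (just a sketch; iommu_mmio_write_q() stands in for whatever
MMIO write primitive the guest driver uses, field names are the ones
from struct qinval_entry in iommu.h):

/* Guest side: queue an Invalidation Wait Descriptor and bump the tail. */
static void iq_submit_inv_wait(struct qinval_entry *iq, unsigned int *tail,
                               unsigned int max_entry, uint64_t status_addr)
{
    struct qinval_entry *qi = &iq[*tail];

    qi->q.inv_wait_dsc.lo.type = TYPE_INVAL_WAIT;
    qi->q.inv_wait_dsc.lo.sw = 1;               /* write status data back */
    qi->q.inv_wait_dsc.lo.sdata = 1;
    qi->q.inv_wait_dsc.hi.saddr = status_addr >> 2;

    *tail = (*tail + 1) % max_entry;
    /* Writing IQT is what makes the (v)IOMMU fetch new descriptors. */
    iommu_mmio_write_q(DMAR_IQT_REG, (uint64_t)*tail << QINVAL_INDEX_SHIFT);
}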

> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
> 
> ---
> v4:
>  - Introduce a lock to protect invalidation related registers.
> ---
>  xen/drivers/passthrough/vtd/iommu.h |  24 +++-
>  xen/drivers/passthrough/vtd/vvtd.c  | 271 +++++++++++++++++++++++++++++++++++-
>  2 files changed, 293 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
> index b71dab8..de9188b 100644
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -47,7 +47,12 @@
>  #define DMAR_IQH_REG            0x80 /* invalidation queue head */
>  #define DMAR_IQT_REG            0x88 /* invalidation queue tail */
>  #define DMAR_IQA_REG            0x90 /* invalidation queue addr */
> +#define DMAR_IQUA_REG           0x94 /* invalidation queue upper addr */
> +#define DMAR_ICS_REG            0x9c /* invalidation completion status */
>  #define DMAR_IECTL_REG          0xa0 /* invalidation event control register */
> +#define DMAR_IEDATA_REG         0xa4 /* invalidation event data register */
> +#define DMAR_IEADDR_REG         0xa8 /* invalidation event address register */
> +#define DMAR_IEUADDR_REG        0xac /* upper address register */
>  #define DMAR_IRTA_REG           0xb8 /* base address of intr remap table */
>  #define DMAR_IRTUA_REG          0xbc /* upper address of intr remap table */
>  
> @@ -175,6 +180,21 @@
>  #define DMA_IRTA_S(val)         (val & 0xf)
>  #define DMA_IRTA_SIZE(val)      (1UL << (DMA_IRTA_S(val) + 1))
>  
> +/* IQA_REG */
> +#define DMA_IQA_ADDR(val)       (val & ~0xfffULL)
> +#define DMA_IQA_QS(val)         (val & 0x7)
> +#define DMA_IQA_RSVD            0xff8ULL
> +
> +/* IECTL_REG */
> +#define DMA_IECTL_IM_SHIFT 31
> +#define DMA_IECTL_IM            (1U << DMA_IECTL_IM_SHIFT)
> +#define DMA_IECTL_IP_SHIFT 30
> +#define DMA_IECTL_IP            (1U << DMA_IECTL_IP_SHIFT)
> +
> +/* ICS_REG */
> +#define DMA_ICS_IWC_SHIFT       0
> +#define DMA_ICS_IWC             (1U << DMA_ICS_IWC_SHIFT)
> +
>  /* PMEN_REG */
>  #define DMA_PMEN_EPM    (((u32)1) << 31)
>  #define DMA_PMEN_PRS    (((u32)1) << 0)
> @@ -205,13 +225,14 @@
>  /* FSTS_REG */
>  #define DMA_FSTS_PFO_SHIFT  0
>  #define DMA_FSTS_PPF_SHIFT  1
> +#define DMA_FSTS_IQE_SHIFT  4
>  #define DMA_FSTS_PRO_SHIFT  7
>  
>  #define DMA_FSTS_PFO        ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
>  #define DMA_FSTS_PPF        ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
>  #define DMA_FSTS_AFO        ((uint32_t)1 << 2)
>  #define DMA_FSTS_APF        ((uint32_t)1 << 3)
> -#define DMA_FSTS_IQE        ((uint32_t)1 << 4)
> +#define DMA_FSTS_IQE        ((uint32_t)1 << DMA_FSTS_IQE_SHIFT)
>  #define DMA_FSTS_ICE        ((uint32_t)1 << 5)
>  #define DMA_FSTS_ITE        ((uint32_t)1 << 6)
>  #define DMA_FSTS_PRO        ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
> @@ -555,6 +576,7 @@ struct qinval_entry {
>  
>  /* Queue invalidation head/tail shift */
>  #define QINVAL_INDEX_SHIFT 4
> +#define QINVAL_INDEX_MASK  0x7fff0ULL
>  
>  #define qinval_present(v) ((v).lo & 1)
>  #define qinval_fault_disable(v) (((v).lo >> 1) & 1)
> diff --git a/xen/drivers/passthrough/vtd/vvtd.c b/xen/drivers/passthrough/vtd/vvtd.c
> index a2fa64a..81170ec 100644
> --- a/xen/drivers/passthrough/vtd/vvtd.c
> +++ b/xen/drivers/passthrough/vtd/vvtd.c
> @@ -27,6 +27,7 @@
>  #include <asm/event.h>
>  #include <asm/io_apic.h>
>  #include <asm/hvm/domain.h>
> +#include <asm/hvm/support.h>
>  #include <asm/p2m.h>
>  
>  #include "iommu.h"
> @@ -68,6 +69,9 @@ struct vvtd {
>  
>      struct hvm_hw_vvtd hw;
>      void *irt_base;
> +    void *inv_queue_base;

Why not declare this as:

struct qinval_entry *

> +    /* This lock protects invalidation related registers */
> +    spinlock_t ie_lock;

As noted in another patch, I think the first approach should be to use
a single lock that serializes access to the whole vIOMMU register
space. Later we can see about more fine grained locking.
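
IOW something along the lines of the below (just a sketch; the lock
field name and domain_vvtd() are assumptions on my side):

static int vvtd_write(struct vcpu *v, unsigned long addr,
                      unsigned int len, unsigned long val)
{
    struct vvtd *vvtd = domain_vvtd(v->domain);

    spin_lock(&vvtd->lock);   /* one lock for the whole register space */
    /* ... existing per-register dispatch, unchanged ... */
    spin_unlock(&vvtd->lock);

    return X86EMUL_OKAY;
}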

>  };
>  
>  /* Setting viommu_verbose enables debugging messages of vIOMMU */
> @@ -284,6 +288,12 @@ static void vvtd_notify_fault(const struct vvtd *vvtd)
>                              vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
>  }
>  
> +static void vvtd_notify_inv_completion(const struct vvtd *vvtd)
> +{
> +    vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_IEADDR_REG),
> +                            vvtd_get_reg(vvtd, DMAR_IEDATA_REG));
> +}
> +
>  /* Computing the IRTE index for a given interrupt request. When success, return
>   * 0 and set index to reference the corresponding IRTE. Otherwise, return < 0,
>   * i.e. -1 when the irq request isn't an remapping format.
> @@ -478,6 +488,189 @@ static int vvtd_record_fault(struct vvtd *vvtd,
>      return X86EMUL_OKAY;
>  }
>  
> +/*
> + * Process an invalidation descriptor. Currently, only two types of descriptors,
> + * Interrupt Entry Cache Invalidation Descriptor and Invalidation Wait
> + * Descriptor are handled.
> + * @vvtd: the virtual vtd instance
> + * @i: the index of the invalidation descriptor to be processed
> + *
> + * Return 0 on success, or non-zero on failure.
> + */
> +static int process_iqe(struct vvtd *vvtd, uint32_t i)
> +{
> +    struct qinval_entry qinval;
> +    int ret = 0;
> +
> +    if ( !vvtd->inv_queue_base )
> +    {
> +        gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
> +        return -1;

If you just return -1 or 0 please use bool instead. Or return proper
error codes.

> +    }
> +    qinval = ((struct qinval_entry *)vvtd->inv_queue_base)[i];

See my comment above regarding how inv_queue_base is declared, I'm not
sure why the copy is needed here.
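
I.e. something like the below sketch, which also switches to proper
error codes as mentioned above (untested, obviously):

static int process_iqe(struct vvtd *vvtd, uint32_t i)
{
    const struct qinval_entry *qinval;

    if ( !vvtd->inv_queue_base )
    {
        gdprintk(XENLOG_ERR, "Invalidation queue base isn't set\n");
        return -ENODEV;
    }

    qinval = &vvtd->inv_queue_base[i];

    switch ( qinval->q.inv_wait_dsc.lo.type )
    {
        /* ... rest as in the patch, using qinval-> instead of qinval. ... */
    }

    return 0;
}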

> +
> +    switch ( qinval.q.inv_wait_dsc.lo.type )
> +    {
> +    case TYPE_INVAL_WAIT:
> +        if ( qinval.q.inv_wait_dsc.lo.sw )
> +        {
> +            uint32_t data = qinval.q.inv_wait_dsc.lo.sdata;
> +            uint64_t addr = qinval.q.inv_wait_dsc.hi.saddr << 2;
> +
> +            ret = hvm_copy_to_guest_phys(addr, &data, sizeof(data), current);
> +            if ( ret )
> +                vvtd_info("Failed to write status address\n");
> +        }
> +
> +        /*
> +         * The following code generates an invalidation completion event
> +         * indicating the invalidation wait descriptor completion. Note that
> +         * the following code fragment is not tested properly.
> +         */
> +        if ( qinval.q.inv_wait_dsc.lo.iflag )
> +        {
> +            if ( !vvtd_test_and_set_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT) )
> +            {
> +                vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +                if ( !vvtd_test_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT) )
> +                {
> +                    vvtd_notify_inv_completion(vvtd);
> +                    vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +                }
> +            }
> +        }
> +        break;
> +
> +    case TYPE_INVAL_IEC:
> +        /* No cache is preserved in vvtd, nothing is needed to be flushed */
> +        break;
> +
> +    default:
> +        vvtd_debug("d%d: Invalidation type (%x) isn't supported\n",
> +                   vvtd->domain->domain_id, qinval.q.inv_wait_dsc.lo.type);
> +        return -1;
> +    }
> +
> +    return ret;
> +}
> +
> +/*
> + * Invalidate all the descriptors in Invalidation Queue.
> + */
> +static void vvtd_process_iq(struct vvtd *vvtd)
> +{
> +    uint32_t max_entry, i, iqh, iqt;
> +    int err = 0;
> +
> +    /* Trylock avoids more than 1 caller dealing with invalidation requests */
> +    if ( !spin_trylock(&vvtd->ie_lock) )

Uh, is this correct? You are returning without the queue being
invalidated AFAICT.

> +        return;
> +
> +    iqh = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQH_REG), QINVAL_INDEX_MASK);
> +    iqt = MASK_EXTR(vvtd_get_reg_quad(vvtd, DMAR_IQT_REG), QINVAL_INDEX_MASK);
> +    /*
> +     * No new descriptor is fetched from the Invalidation Queue until
> +     * software clears the IQE field in the Fault Status Register
> +     */
> +    if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_IQE_SHIFT) )
> +    {
> +        spin_unlock(&vvtd->ie_lock);
> +        return;
> +    }
> +
> +    max_entry = 1 << (QINVAL_ENTRY_ORDER +
> +                      DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
> +
> +    ASSERT(iqt < max_entry);

Is it possible for the user to write a valid value to DMAR_IQT_REG and
then change DMAR_IQA_REG in order to make the above ASSERT trigger?
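
If so, IMO this should be treated as a guest error rather than an
ASSERT, something like the below (whether IQE is the right fault bit
to raise here would need to be checked against the spec):

    if ( iqt >= max_entry )
    {
        spin_unlock(&vvtd->ie_lock);
        spin_lock(&vvtd->fe_lock);
        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
        spin_unlock(&vvtd->fe_lock);
        return;
    }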

> +    if ( iqh == iqt )
> +    {
> +        spin_unlock(&vvtd->ie_lock);
> +        return;
> +    }
> +
> +    for ( i = iqh; i != iqt; i = (i + 1) % max_entry )
> +    {
> +        err = process_iqe(vvtd, i);
> +        if ( err )
> +            break;
> +    }
> +
> +    /*
> +     * set IQH before checking error, because IQH should reference
> +     * the descriptor associated with the error when an error is seen
> +     * by guest
> +     */
> +    vvtd_set_reg_quad(vvtd, DMAR_IQH_REG, i << QINVAL_INDEX_SHIFT);
> +
> +    spin_unlock(&vvtd->ie_lock);
> +    if ( err )
> +    {
> +        spin_lock(&vvtd->fe_lock);
> +        vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_IQE_SHIFT);
> +        spin_unlock(&vvtd->fe_lock);
> +    }
> +}
> +
> +static void vvtd_write_iqt(struct vvtd *vvtd, uint32_t val)
> +{
> +    uint32_t max_entry;
> +
> +    if ( val & ~QINVAL_INDEX_MASK )
> +    {
> +        vvtd_info("attempts to set reserved bits in IQT\n");
> +        return;
> +    }
> +
> +    max_entry = 1U << (QINVAL_ENTRY_ORDER +
> +                       DMA_IQA_QS(vvtd_get_reg_quad(vvtd, DMAR_IQA_REG)));
> +    if ( MASK_EXTR(val, QINVAL_INDEX_MASK) >= max_entry )
> +    {
> +        vvtd_info("IQT: Value %x exceeded supported max index.", val);
> +        return;
> +    }
> +
> +    vvtd_set_reg(vvtd, DMAR_IQT_REG, val);
> +}
> +
> +static void vvtd_write_iqa(struct vvtd *vvtd, uint32_t val, bool high)
> +{
> +    uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
> +    uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +    uint64_t new;
> +
> +    if ( high )
> +        new = ((uint64_t)val << 32) | (old & 0xffffffff);
> +    else
> +        new = ((old >> 32) << 32) | val;

You can also use old & ~0xffffffffUL

> +
> +    if ( new & (~((1ULL << cap_mgaw(cap)) - 1) | DMA_IQA_RSVD) )
> +    {
> +        vvtd_info("Attempt to set reserved bits in IQA\n");
> +        return;
> +    }
> +
> +    vvtd_set_reg_quad(vvtd, DMAR_IQA_REG, new);
> +    if ( high && !vvtd->inv_queue_base )
> +        vvtd->inv_queue_base = map_guest_pages(vvtd->domain,
> +                                               PFN_DOWN(DMA_IQA_ADDR(new)),
> +                                               1 << DMA_IQA_QS(new));

Don't you need to take a reference to these page(s)?
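
I would expect something like the below before (or inside)
map_guest_pages(), unless that helper already takes the references
itself (haven't checked; names and layout here are just a sketch):

/* Take a reference on each guest frame backing the queue. */
static bool iq_get_page_refs(struct domain *d, unsigned long gfn,
                             struct page_info **pgs, unsigned int nr)
{
    unsigned int i;

    for ( i = 0; i < nr; i++ )
    {
        pgs[i] = get_page_from_gfn(d, gfn + i, NULL, P2M_ALLOC);
        if ( !pgs[i] )
        {
            /* Unwind: drop the references already taken. */
            while ( i-- )
                put_page(pgs[i]);
            return false;
        }
    }

    return true;
}

The references would then be dropped again wherever the queue is
unmapped.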

> +    else if ( !high && vvtd->inv_queue_base )

I'm not sure I follow the logic with high here.

> +    {
> +        unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));
> +        vvtd->inv_queue_base = NULL;
> +    }
> +}
> +
> +static void vvtd_write_ics(struct vvtd *vvtd, uint32_t val)
> +{
> +    if ( val & DMA_ICS_IWC )
> +    {
> +        vvtd_clear_bit(vvtd, DMAR_ICS_REG, DMA_ICS_IWC_SHIFT);
> +        /* When IWC field is cleared, the IP field needs to be cleared */
> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT);
> +    }
> +}
> +
>  static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>  {
>      /* Writing a 1 means clear fault */
> @@ -489,6 +682,20 @@ static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
>      return X86EMUL_OKAY;
>  }
>  
> +static void vvtd_write_iectl(struct vvtd *vvtd, uint32_t val)
> +{
> +    /* Only DMA_IECTL_IM bit is writable. Generate pending event when unmask */
> +    if ( !(val & DMA_IECTL_IM) )
> +    {
> +        /* Clear IM and clear IP */
> +        vvtd_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> +        if ( vvtd_test_and_clear_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IP_SHIFT) )
> +            vvtd_notify_inv_completion(vvtd);
> +    }
> +    else
> +        vvtd_set_bit(vvtd, DMAR_IECTL_REG, DMA_IECTL_IM_SHIFT);
> +}
> +
>  static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
>  {
>      /*
> @@ -681,6 +888,48 @@ static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
>      spin_unlock(&vvtd->fe_lock);
>  }
>  
> +static void vvtd_write_invalidation_regs(struct vvtd *vvtd, unsigned long val,
> +                                         unsigned int offset, unsigned int len)
> +{
> +    spin_lock(&vvtd->ie_lock);
> +    for ( ; len ; len -= 4, offset += 4, val = val >> 32)

Same comment as in the previous patch: I don't really like the for
loop, but I guess 64-bit accesses must be allowed to this group of
registers?
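
I.e. if only 4 and 8 byte accesses are allowed this could be written
as the below, with the switch moved into a hypothetical
vvtd_write_invalidation_reg() helper:

    if ( len == 8 )
    {
        vvtd_write_invalidation_reg(vvtd, offset, val);
        vvtd_write_invalidation_reg(vvtd, offset + 4, val >> 32);
    }
    else
        vvtd_write_invalidation_reg(vvtd, offset, val);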

> +    {
> +        switch ( offset )
> +        {
> +        case DMAR_IECTL_REG:
> +            vvtd_write_iectl(vvtd, val);
> +            break;
> +
> +        case DMAR_ICS_REG:
> +            vvtd_write_ics(vvtd, val);
> +            break;
> +
> +        case DMAR_IQT_REG:
> +            vvtd_write_iqt(vvtd, val);
> +            break;
> +
> +        case DMAR_IQA_REG:
> +            vvtd_write_iqa(vvtd, val, 0);
> +            break;
> +
> +        case DMAR_IQUA_REG:
> +            vvtd_write_iqa(vvtd, val, 1);
> +            break;
> +
> +        case DMAR_IEDATA_REG:
> +        case DMAR_IEADDR_REG:
> +        case DMAR_IEUADDR_REG:
> +            vvtd_set_reg(vvtd, offset, val);
> +            break;
> +
> +        default:
> +            break;
> +        }
> +    }
> +    spin_unlock(&vvtd->ie_lock);
> +
> +}
> +
>  static int vvtd_write(struct vcpu *v, unsigned long addr,
>                        unsigned int len, unsigned long val)
>  {
> @@ -719,6 +968,17 @@ static int vvtd_write(struct vcpu *v, unsigned long addr,
>          vvtd_write_fault_regs(vvtd, val, offset, len);
>          break;
>  
> +    case DMAR_IECTL_REG:
> +    case DMAR_ICS_REG:
> +    case DMAR_IQT_REG:
> +    case DMAR_IQA_REG:
> +    case DMAR_IQUA_REG:
> +    case DMAR_IEDATA_REG:
> +    case DMAR_IEADDR_REG:
> +    case DMAR_IEUADDR_REG:
> +        vvtd_write_invalidation_regs(vvtd, val, offset, len);
> +        break;
> +
>      default:
>          if ( (offset == (fault_offset + DMA_FRCD2_OFFSET)) ||
>               (offset == (fault_offset + DMA_FRCD3_OFFSET)) )
> @@ -840,7 +1100,8 @@ static int vvtd_handle_irq_request(const struct domain *d,
>                          irte.remap.tm);
>  
>   out:
> -    atomic_dec(&vvtd->inflight_intr);
> +    if ( !atomic_dec_and_test(&vvtd->inflight_intr) )
> +        vvtd_process_iq(vvtd);
>      return ret;
>  }
>  
> @@ -911,6 +1172,7 @@ static int vvtd_create(struct domain *d, struct viommu *viommu)
>      vvtd->domain = d;
>      register_mmio_handler(d, &vvtd_mmio_ops);
>      spin_lock_init(&vvtd->fe_lock);
> +    spin_lock_init(&vvtd->ie_lock);
>  
>      viommu->priv = vvtd;
>  
> @@ -930,6 +1192,13 @@ static int vvtd_destroy(struct viommu *viommu)
>                                       sizeof(struct iremap_entry)));
>              vvtd->irt_base = NULL;
>          }
> +        if ( vvtd->inv_queue_base )
> +        {
> +            uint64_t old = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);
> +
> +            unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(old));

Don't you also need to unmap these page(s) when QIE is disabled?
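
I.e. something like the below, hooked into wherever GCMD.QIE is
handled (handler name assumed, I haven't checked where that lives in
this series):

/* Mirror the vvtd_destroy() cleanup when the guest clears QIE. */
static void vvtd_handle_qie(struct vvtd *vvtd, bool enable)
{
    if ( !enable && vvtd->inv_queue_base )
    {
        uint64_t iqa = vvtd_get_reg_quad(vvtd, DMAR_IQA_REG);

        unmap_guest_pages(vvtd->inv_queue_base, 1 << DMA_IQA_QS(iqa));
        vvtd->inv_queue_base = NULL;
    }
}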

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

