
Re: [Xen-devel] [PATCH v4 9/9] vpci/msix: add MSI-X handlers



>>> Roger Pau Monne <roger.pau@xxxxxxxxxx> 06/30/17 5:01 PM >>>
>Note that accesses to the Table Offset, Table BIR, PBA Offset and PBA
>BIR are not trapped by Xen at the moment.

They're mandated r/o by the spec anyway.

>@@ -113,6 +148,35 @@ static int vpci_modify_bar(struct domain *d, const struct vpci_bar *bar,
>if ( IS_ERR(mem) )
>return -PTR_ERR(mem);
>
>+    /*
>+     * Make sure the MSI-X regions of the BAR are not mapped into the domain
>+     * p2m, or else the MSI-X handlers are useless. Only do this when mapping,
>+     * since that's when the memory decoding on the device is enabled.
>+     */
>+    for ( i = 0; map && i < ARRAY_SIZE(bar->msix); i++ )
>+    {
>+        struct vpci_msix_mem *msix = bar->msix[i];
>+
>+        if ( !msix || msix->addr == INVALID_PADDR )
>+            continue;
>+
>+        rc = vpci_unmap_msix(d, msix);

Why do you need this, instead of being able to simply rely on the rangeset
based (un)mapping?

>@@ -405,7 +475,20 @@ static int vpci_init_bars(struct pci_dev *pdev)
>continue;
>}
>
>-        bars[i].addr = (cmd & PCI_COMMAND_MEMORY) ? addr : INVALID_PADDR;
>+        if ( cmd & PCI_COMMAND_MEMORY )
>+        {
>+            unsigned int j;
>+
>+            bars[i].addr = addr;
>+
>+            for ( j = 0; j < ARRAY_SIZE(bars[i].msix); j++ )
>+                if ( bars[i].msix[j] )
>+                    bars[i].msix[j]->addr = bars[i].addr +
>+                                            bars[i].msix[j]->offset;
>+        }
>+        else
>+            bars[i].addr = INVALID_PADDR;

As (I think) mentioned elsewhere already, this init-time special case looks
dangerous (and unnecessary) to me (or else I'd expect you to also zap
the field when the memory decode bit is being cleared).
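For the latter, something along these lines in the command register write
handler would be needed (untested sketch; the handler shape and the "header"
pointer are assumptions on my part):

    if ( !(val.u16 & PCI_COMMAND_MEMORY) )
    {
        unsigned int i, j;

        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
            for ( j = 0; j < ARRAY_SIZE(header->bars[i].msix); j++ )
                if ( header->bars[i].msix[j] )
                    header->bars[i].msix[j]->addr = INVALID_PADDR;
    }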

>--- /dev/null
>+++ b/xen/drivers/vpci/msix.c
>@@ -0,0 +1,503 @@
>+/*
>+ * Handlers for accesses to the MSI-X capability structure and the memory
>+ * region.
>+ *
>+ * Copyright (C) 2017 Citrix Systems R&D
>+ *
>+ * This program is free software; you can redistribute it and/or
>+ * modify it under the terms and conditions of the GNU General Public
>+ * License, version 2, as published by the Free Software Foundation.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public
>+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
>+ */
>+
>+#include <xen/sched.h>
>+#include <xen/vpci.h>
>+#include <asm/msi.h>
>+#include <xen/p2m-common.h>
>+#include <xen/keyhandler.h>
>+
>+#define MSIX_SIZE(num) (offsetof(struct vpci_msix, entries[num]))

The outermost parens are pointless here.
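I.e. just:

#define MSIX_SIZE(num) offsetof(struct vpci_msix, entries[num])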

>+static void vpci_msix_control_write(struct pci_dev *pdev, unsigned int reg,
>+                                    union vpci_val val, void *data)
>+{
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix = data;
>+    paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+    bool new_masked, new_enabled;
>+    unsigned int i;
>+    int rc;
>+
>+    new_masked = val.u16 & PCI_MSIX_FLAGS_MASKALL;
>+    new_enabled = val.u16 & PCI_MSIX_FLAGS_ENABLE;
>+
>+    if ( !msix->enabled && new_enabled )
>+    {
>+        /* MSI-X enabled. */

Insert "being"?

>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            if ( msix->entries[i].masked )
>+                continue;

Why is the mask bit relevant here, but not the mask-all one?

>+            rc = vpci_msix_arch_enable(&msix->entries[i].arch, pdev,
>+                                       msix->entries[i].addr,
>+                                       msix->entries[i].data,
>+                                       msix->entries[i].nr, table_base);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to update entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+
>+            vpci_msix_arch_mask(&msix->entries[i].arch, pdev, false);

Same question here.

>+        }
>+    }
>+    else if ( msix->enabled && !new_enabled )
>+    {
>+        /* MSI-X disabled. */
>+        for ( i = 0; i < msix->max_entries; i++ )
>+        {
>+            rc = vpci_msix_arch_disable(&msix->entries[i].arch, pdev);
>+            if ( rc )
>+            {
>+                gdprintk(XENLOG_ERR,
>+                         "%04x:%02x:%02x.%u: unable to disable entry %u: %d\n",
>+                         seg, bus, slot, func, i, rc);
>+                return;
>+            }
>+        }
>+    }
>+
>+    if ( (new_enabled != msix->enabled || new_masked != msix->masked) &&
>+         pci_msi_conf_write_intercept(pdev, reg, 2, &val.u32) >= 0 )
>+        pci_conf_write16(seg, bus, slot, func, reg, val.u32);

DYM val.u16 here?
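I.e. presumably:

        pci_conf_write16(seg, bus, slot, func, reg, val.u16);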

>+static struct vpci_msix *vpci_msix_find(struct domain *d, unsigned long addr)
>+{
>+    struct vpci_msix *msix;
>+
>+    ASSERT(vpci_locked(d));
>+    list_for_each_entry ( msix,  &d->arch.hvm_domain.msix_tables, next )
>+    {
>+        uint8_t seg = msix->pdev->seg, bus = msix->pdev->bus;
>+        uint8_t slot = PCI_SLOT(msix->pdev->devfn);
>+        uint8_t func = PCI_FUNC(msix->pdev->devfn);
>+        uint16_t cmd = pci_conf_read16(seg, bus, slot, func, PCI_COMMAND);

Perhaps better to keep a cached copy of the command register value?
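E.g. (just a thought; "cmd" as a field cached in struct vpci_header whenever
the guest writes PCI_COMMAND is made up):

        uint16_t cmd = msix->pdev->vpci->header.cmd;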

>+static int vpci_msix_read(struct vcpu *v, unsigned long addr,
>+                          unsigned int len, unsigned long *data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    const struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        *data = ~0ul;
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Access to PBA. */
>+        switch ( len )
>+        {
>+        case 4:
>+            *data = readl(addr);
>+            break;
>+        case 8:
>+            *data = readq(addr);
>+            break;
>+        default:
>+            ASSERT_UNREACHABLE();

*data = ~0ul;

>+static int vpci_msix_write(struct vcpu *v, unsigned long addr,
>+                                 unsigned int len, unsigned long data)
>+{
>+    struct domain *d = v->domain;
>+    struct vpci_msix *msix;
>+    struct vpci_msix_entry *entry;
>+    unsigned int offset;
>+
>+    vpci_lock(d);
>+    msix = vpci_msix_find(d, addr);
>+    if ( !msix )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( MSIX_ADDR_IN_RANGE(addr, &msix->pba) )
>+    {
>+        /* Ignore writes to PBA, it's behavior is undefined. */
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    if ( vpci_msix_access_check(msix->pdev, addr, len) )
>+    {
>+        vpci_unlock(d);
>+        return X86EMUL_OKAY;
>+    }
>+
>+    /* Get the table entry and offset. */
>+    entry = vpci_msix_get_entry(msix, addr);
>+    offset = addr & (PCI_MSIX_ENTRY_SIZE - 1);
>+
>+    switch ( offset )
>+    {
>+    case PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET:
>+        if ( len == 8 )
>+        {
>+            entry->addr = data;
>+            break;
>+        }
>+        entry->addr &= ~0xffffffff;

With this, why not ...

>+        entry->addr |= data;
>+        break;
>+    case PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET:
>+        entry->addr &= ~((uint64_t)0xffffffff << 32);

... simply 0xffffffff here?
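I.e.:

        entry->addr &= 0xffffffff;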

>+        entry->addr |= data << 32;

Iirc we've already talked about this being undefined for 32-bit arches (e.g.
ARM32), and the resulting need to make "data" uint64_t.
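A cast would be the minimal fix, i.e. something like:

        entry->addr |= (uint64_t)data << 32;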

>+        break;
>+    case PCI_MSIX_ENTRY_DATA_OFFSET:
>+        /*
>+         * 8 byte writes to the msg data and vector control fields are
>+         * only allowed if the entry is masked.
>+         */
>+        if ( len == 8 && !entry->masked && !msix->masked && msix->enabled )
>+        {
>+            vpci_unlock(d);
>+            return X86EMUL_OKAY;
>+        }

I don't think this is correct - iirc such writes simply don't take effect
immediately (but I then seem to recall this to apply to the address field
and 32-bit writes to the data field as well). They'd instead take effect
the next time the entry is being unmasked (or some such). A while ago I
did fix the qemu code to behave in this way.
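Roughly, for the vector control write path, something like this (untested
sketch against the structures in this patch; the "updated" flag would be a
new field recording a deferred address/data update):

        if ( !new_masked && entry->masked )
        {
            if ( entry->updated )
            {
                rc = vpci_msix_arch_enable(&entry->arch, pdev, entry->addr,
                                           entry->data, entry->nr, table_base);
                if ( rc )
                {
                    vpci_unlock(d);
                    return X86EMUL_OKAY;
                }
                entry->updated = false;
            }
            vpci_msix_arch_mask(&entry->arch, pdev, false);
        }
        else if ( new_masked && !entry->masked )
            vpci_msix_arch_mask(&entry->arch, pdev, true);

        entry->masked = new_masked;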

>+        entry->data = data;
>+
>+        if ( len == 4 )
>+            break;
>+
>+        data >>= 32;
>+        /* fallthrough */
>+    case PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET:
>+    {
>+        bool new_masked = data & PCI_MSIX_VECTOR_BITMASK;
>+        struct pci_dev *pdev = msix->pdev;
>+        paddr_t table_base = pdev->vpci->header.bars[msix->table.bir].addr;
>+        int rc;
>+
>+        if ( !msix->enabled )
>+        {
>+            entry->masked = new_masked;
>+            break;
>+        }
>+
>+        if ( new_masked != entry->masked && !new_masked )

if ( !new_masked && entry->masked )

(or the other way around)

>+static int vpci_init_msix(struct pci_dev *pdev)
>+{
>+    struct domain *d = pdev->domain;
>+    uint8_t seg = pdev->seg, bus = pdev->bus;
>+    uint8_t slot = PCI_SLOT(pdev->devfn), func = PCI_FUNC(pdev->devfn);
>+    struct vpci_msix *msix;
>+    unsigned int msix_offset, i, max_entries;
>+    struct vpci_bar *table_bar, *pba_bar;
>+    uint16_t control;
>+    int rc;
>+
>+    msix_offset = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>+    if ( !msix_offset )
>+        return 0;
>+
>+    control = pci_conf_read16(seg, bus, slot, func,
>+                              msix_control_reg(msix_offset));
>+
>+    /* Get the maximum number of vectors the device supports. */
>+    max_entries = msix_table_size(control);
>+
>+    msix = xzalloc_bytes(MSIX_SIZE(max_entries));
>+    if ( !msix )
>+        return -ENOMEM;
>+
>+    msix->max_entries = max_entries;
>+    msix->pdev = pdev;
>+
>+    /* Find the MSI-X table address. */
>+    msix->table.offset = pci_conf_read32(seg, bus, slot, func,
>+                                         msix_table_offset_reg(msix_offset));
>+    msix->table.bir = msix->table.offset & PCI_MSIX_BIRMASK;
>+    msix->table.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->table.size = msix->max_entries * PCI_MSIX_ENTRY_SIZE;
>+    msix->table.addr = INVALID_PADDR;
>+
>+    /* Find the MSI-X pba address. */
>+    msix->pba.offset = pci_conf_read32(seg, bus, slot, func,
>+                                       msix_pba_offset_reg(msix_offset));
>+    msix->pba.bir = msix->pba.offset & PCI_MSIX_BIRMASK;
>+    msix->pba.offset &= ~PCI_MSIX_BIRMASK;
>+    msix->pba.size = DIV_ROUND_UP(msix->max_entries, 8);

I think you want to round up to at least the next 32-bit boundary; the
spec talking about bits 63..00 even suggests a 64-bit boundary. The
table addresses being required to be qword aligned also supports this.
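I.e. perhaps (assuming rounding up to a qword boundary is what we settle on):

    msix->pba.size = ROUNDUP(DIV_ROUND_UP(msix->max_entries, 8), 8);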

>+void vpci_dump_msix(void)
>+{
>+    struct domain *d;
>+    struct pci_dev *pdev;

const for all pointers in dump handlers, as far as possible.

>+    for_each_domain ( d )
>+    {
>+        if ( !has_vpci(d) )
>+            continue;
>+
>+        printk("vPCI MSI-X information for guest %u\n", d->domain_id);

Wouldn't it be better (more useful) to dump the MSI and MSI-X data for a
domain next to each other?

Apart from the comments here, the ones given for the MSI patch apply
respectively.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 

