[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 10/10] vpci/sriov: add support for SR-IOV capability



So that a PCI device that supports SR-IOV (PF) can enable the capability
and use the virtual functions.

This code is expected to only be used by privileged domains,
unprivileged domains should not get access to the SR-IOV capability.

The current code detects enabling of the virtual functions feature and
automatically adds the VFs to the domain. It also detects enabling of
memory space and maps the VFs BARs into the domain p2m. Disabling of
the VF enable bit removes the devices and the BAR memory map from the
domain p2m.

Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
---
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Cc: George Dunlap <George.Dunlap@xxxxxxxxxxxxx>
Cc: Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxx>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>
Cc: Tim Deegan <tim@xxxxxxx>
Cc: Wei Liu <wei.liu2@xxxxxxxxxx>
---
Tested with a Linux PVH Dom0 with Intel I350 nic.
---
 xen/drivers/vpci/Makefile |   2 +-
 xen/drivers/vpci/header.c |  67 ++++++++--
 xen/drivers/vpci/sriov.c  | 273 ++++++++++++++++++++++++++++++++++++++
 xen/include/xen/vpci.h    |   9 +-
 4 files changed, 337 insertions(+), 14 deletions(-)
 create mode 100644 xen/drivers/vpci/sriov.c

diff --git a/xen/drivers/vpci/Makefile b/xen/drivers/vpci/Makefile
index 55d1bdfda0..6274f60e74 100644
--- a/xen/drivers/vpci/Makefile
+++ b/xen/drivers/vpci/Makefile
@@ -1 +1 @@
-obj-y += vpci.o header.o msi.o msix.o
+obj-y += vpci.o header.o msi.o msix.o sriov.o
diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index d2188da7f0..24fb469df5 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -183,7 +183,19 @@ static void defer_map(struct domain *d, struct pci_dev 
*pdev,
      * is mapped. This can lead to parallel mapping operations being
      * started for the same device if the domain is not well-behaved.
      */
-    curr->vpci.pdev = pdev;
+    if ( !pdev->info.is_virtfn )
+        curr->vpci.pdev = pdev;
+    else
+    {
+        unsigned int i;
+        /*
+         * Set the BARs as enabled now, for VF the memory decoding is not
+         * controlled by the VF command register.
+         */
+        for ( i = 0; i < ARRAY_SIZE(pdev->vpci->header.bars); i++ )
+            if ( MAPPABLE_BAR(&pdev->vpci->header.bars[i]) )
+                pdev->vpci->header.bars[i].enabled = map;
+    }
     if ( !curr->vpci.mem )
         curr->vpci.mem = mem;
     else
@@ -201,11 +213,11 @@ static void defer_map(struct domain *d, struct pci_dev 
*pdev,
     curr->vpci.rom_only = rom_only;
 }
 
-static int modify_bars(const struct pci_dev *pdev, bool map, bool rom_only)
+int vpci_modify_bars(const struct pci_dev *pdev, bool map, bool rom_only)
 {
     struct vpci_header *header = &pdev->vpci->header;
     struct rangeset *mem = rangeset_new(NULL, NULL, 0);
-    struct pci_dev *tmp, *dev = NULL;
+    struct pci_dev *tmp, *dev = NULL, *parent = NULL;
     const struct vpci_msix *msix = pdev->vpci->msix;
     unsigned int i;
     int rc;
@@ -261,12 +273,29 @@ static int modify_bars(const struct pci_dev *pdev, bool 
map, bool rom_only)
         }
     }
 
+    /* Get the parent dev if it's a VF. */
+    if ( pdev->info.is_virtfn )
+    {
+        pcidevs_lock();
+        parent = pci_get_pdev(pdev->seg, pdev->info.physfn.bus,
+                              pdev->info.physfn.devfn);
+        pcidevs_unlock();
+    }
+
     /*
      * Check for overlaps with other BARs. Note that only BARs that are
      * currently mapped (enabled) are checked for overlaps.
      */
     list_for_each_entry(tmp, &pdev->domain->arch.pdev_list, domain_list)
     {
+        /*
+         * When mapping the BARs of a VF the parent PF is already locked,
+         * trying to lock it will result in a deadlock. This is because
+         * vpci_modify_bars is called from the parent PF control_write register
+         * handler.
+         */
+        bool lock = parent != tmp;
+
         if ( tmp == pdev )
         {
             /*
@@ -283,10 +312,12 @@ static int modify_bars(const struct pci_dev *pdev, bool 
map, bool rom_only)
                 continue;
         }
 
-        spin_lock(&tmp->vpci_lock);
+        if ( lock )
+            spin_lock(&tmp->vpci_lock);
         if ( !tmp->vpci )
         {
-            spin_unlock(&tmp->vpci_lock);
+            if ( lock )
+                spin_unlock(&tmp->vpci_lock);
             continue;
         }
         for ( i = 0; i < ARRAY_SIZE(tmp->vpci->header.bars); i++ )
@@ -307,14 +338,16 @@ static int modify_bars(const struct pci_dev *pdev, bool 
map, bool rom_only)
             rc = rangeset_remove_range(mem, start, end);
             if ( rc )
             {
-                spin_unlock(&tmp->vpci_lock);
+                if ( lock )
+                    spin_unlock(&tmp->vpci_lock);
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
                 rangeset_destroy(mem);
                 return rc;
             }
         }
-        spin_unlock(&tmp->vpci_lock);
+        if ( lock )
+            spin_unlock(&tmp->vpci_lock);
     }
 
     ASSERT(dev);
@@ -356,7 +389,7 @@ static void cmd_write(const struct pci_dev *pdev, unsigned 
int reg,
          * memory decoding bit has not been changed, so leave everything as-is,
          * hoping the guest will realize and try again.
          */
-        modify_bars(pdev, cmd & PCI_COMMAND_MEMORY, false);
+        vpci_modify_bars(pdev, cmd & PCI_COMMAND_MEMORY, false);
     else
         pci_conf_write16(pdev->seg, pdev->bus, slot, func, reg, cmd);
 }
@@ -437,13 +470,13 @@ static void rom_write(const struct pci_dev *pdev, 
unsigned int reg,
         header->rom_enabled = new_enabled;
         pci_conf_write32(pdev->seg, pdev->bus, slot, func, reg, val);
     }
-    else if ( modify_bars(pdev, new_enabled, true) )
+    else if ( vpci_modify_bars(pdev, new_enabled, true) )
         /*
          * No memory has been added or removed from the p2m (because the actual
          * p2m changes are deferred in defer_map) and the ROM enable bit has
          * not been changed, so leave everything as-is, hoping the guest will
          * realize and try again. It's important to not update rom->addr in the
-         * unmap case if modify_bars has failed, or future attempts would
+         * unmap case if vpci_modify_bars has failed, or future attempts would
          * attempt to unmap the wrong address.
          */
         return;
@@ -465,6 +498,16 @@ static int init_bars(struct pci_dev *pdev)
     };
     int rc;
 
+    if ( pdev->info.is_virtfn )
+        /*
+         * No need to set traps for the command register or the BAR registers
+         * because those are not used by VFs. Memory decoding and position of
+         * the VF BARs is controlled from the PF.
+         *
+         * TODO: add DomU support for VFs.
+         */
+        return 0;
+
     switch ( pci_conf_read8(pdev->seg, pdev->bus, slot, func, PCI_HEADER_TYPE)
              & 0x7f )
     {
@@ -573,7 +616,7 @@ static int init_bars(struct pci_dev *pdev)
             rom->type = VPCI_BAR_EMPTY;
     }
 
-    return (cmd & PCI_COMMAND_MEMORY) ? modify_bars(pdev, true, false) : 0;
+    return (cmd & PCI_COMMAND_MEMORY) ? vpci_modify_bars(pdev, true, false) : 
0;
 }
 
 static void teardown_bars(struct pci_dev *pdev)
@@ -584,7 +627,7 @@ static void teardown_bars(struct pci_dev *pdev)
     if ( cmd & PCI_COMMAND_MEMORY )
     {
         /* Unmap all BARs from guest p2m. */
-        modify_bars(pdev, false, false);
+        vpci_modify_bars(pdev, false, false);
         current->vpci.pdev = NULL;
     }
 }
diff --git a/xen/drivers/vpci/sriov.c b/xen/drivers/vpci/sriov.c
new file mode 100644
index 0000000000..7e8ae70188
--- /dev/null
+++ b/xen/drivers/vpci/sriov.c
@@ -0,0 +1,273 @@
+/*
+ * Handlers for accesses to the SR-IOV capability structure.
+ *
+ * Copyright (C) 2018 Citrix Systems R&D
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/delay.h>
+#include <xen/sched.h>
+#include <xen/vpci.h>
+
+#define SRIOV_SIZE(num) offsetof(struct vpci_sriov, vf[num])
+
+static void modify_memory_mapping(const struct pci_dev *pdev, unsigned int pos,
+                                  bool enable)
+{
+    const struct vpci_sriov *sriov = pdev->vpci->sriov;
+    unsigned int i;
+    int rc;
+
+    if ( enable )
+    {
+        struct pci_dev *pf_dev;
+
+        pcidevs_lock();
+        /*
+         * NB: a non-const pci_dev of the PF is needed in order to update
+         * vf_rlen.
+         */
+        pf_dev = pci_get_pdev(pdev->seg, pdev->bus, pdev->devfn);
+        pcidevs_unlock();
+        ASSERT(pf_dev);
+
+        /* Set the BARs addresses and size. */
+        for ( i = 0; i < PCI_SRIOV_NUM_BARS; i += rc )
+        {
+            unsigned int j, idx = pos + PCI_SRIOV_BAR + i * 4;
+            const pci_sbdf_t sbdf = {
+                .sbdf = PCI_SBDF3(pdev->seg, pdev->bus, pdev->devfn),
+            };
+            uint32_t bar = pci_conf_read32(pdev->seg, pdev->bus,
+                                           PCI_SLOT(pdev->devfn),
+                                           PCI_FUNC(pdev->devfn), idx);
+            uint64_t addr, size;
+
+            rc = pci_size_mem_bar(sbdf, idx, &addr, &size,
+                                  PCI_BAR_VF |
+                                  ((i == PCI_SRIOV_NUM_BARS - 1) ?
+                                   PCI_BAR_LAST : 0));
+            if ( rc <= 0 )
+            {
+                gprintk(XENLOG_ERR,
+                        "%04x:%02x:%02x.%u: failed to size VF BAR\n",
+                        pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                        PCI_FUNC(pdev->devfn));
+                domain_crash(pdev->domain);
+                return;
+            }
+
+            /*
+             * Update vf_rlen on the PF. According to the spec the size of
+             * the BARs can change if the system page size register is
+             * modified, so always update rlen when enabling VFs.
+             */
+            pf_dev->vf_rlen[i] = size;
+
+            for ( j = 0; j < sriov->num_vfs; j++ )
+            {
+                struct vpci_header *header;
+
+                if ( !sriov->vf[j] )
+                    /* Can happen if pci_add_device fails. */
+                    continue;
+
+                spin_lock(&sriov->vf[j]->vpci_lock);
+                header = &sriov->vf[j]->vpci->header;
+
+                if ( !size )
+                {
+                    header->bars[i].type = VPCI_BAR_EMPTY;
+                    spin_unlock(&sriov->vf[j]->vpci_lock);
+                    continue;
+                }
+
+                header->bars[i].addr = addr + size * j;
+                header->bars[i].size = size;
+                header->bars[i].prefetchable =
+                    bar & PCI_BASE_ADDRESS_MEM_PREFETCH;
+
+                switch ( rc )
+                {
+                case 1:
+                    header->bars[i].type = VPCI_BAR_MEM32;
+                    break;
+
+                case 2:
+                    header->bars[i].type = VPCI_BAR_MEM64_LO;
+                    header->bars[i + 1].type = VPCI_BAR_MEM64_HI;
+                    break;
+
+                default:
+                    ASSERT_UNREACHABLE();
+                    spin_unlock(&sriov->vf[j]->vpci_lock);
+                    domain_crash(pdev->domain);
+                    return;
+                }
+                spin_unlock(&sriov->vf[j]->vpci_lock);
+            }
+        }
+    }
+
+    /* Add/remove mappings for the VFs BARs into the p2m. */
+    for ( i = 0; i < sriov->num_vfs; i++ )
+    {
+        struct pci_dev *vf_pdev = sriov->vf[i];
+
+        spin_lock(&vf_pdev->vpci_lock);
+        rc = vpci_modify_bars(vf_pdev, enable, false);
+        spin_unlock(&vf_pdev->vpci_lock);
+        if ( rc )
+            gprintk(XENLOG_ERR,
+                    "failed to %smap BARs of VF %04x:%02x:%02x.%u: %d\n",
+                    enable ? "" : "un", vf_pdev->seg, vf_pdev->bus,
+                    PCI_SLOT(vf_pdev->devfn), PCI_FUNC(vf_pdev->devfn), rc);
+    }
+}
+
+static void control_write(const struct pci_dev *pdev, unsigned int reg,
+                          uint32_t val, void *data)
+{
+    struct vpci_sriov *sriov = data;
+    unsigned int i, pos = reg - PCI_SRIOV_CTRL;
+    uint16_t control = pci_conf_read16(pdev->seg, pdev->bus,
+                                       PCI_SLOT(pdev->devfn),
+                                       PCI_FUNC(pdev->devfn),
+                                       pos + PCI_SRIOV_CTRL);
+    bool enabled = control & PCI_SRIOV_CTRL_VFE;
+    bool mem_enabled = control & PCI_SRIOV_CTRL_MSE;
+    bool new_enabled = val & PCI_SRIOV_CTRL_VFE;
+    bool new_mem_enabled = val & PCI_SRIOV_CTRL_MSE;
+    int rc;
+
+    if ( new_enabled != enabled )
+    {
+        uint16_t offset = pci_conf_read16(pdev->seg, pdev->bus,
+                                          PCI_SLOT(pdev->devfn),
+                                          PCI_FUNC(pdev->devfn),
+                                          pos + PCI_SRIOV_VF_OFFSET);
+        uint16_t stride = pci_conf_read16(pdev->seg, pdev->bus,
+                                          PCI_SLOT(pdev->devfn),
+                                          PCI_FUNC(pdev->devfn),
+                                          pos + PCI_SRIOV_VF_STRIDE);
+
+        if ( new_enabled )
+        {
+            /*
+             * Only update the number of active VFs when enabling, when
+             * disabling use the cached value in order to always remove the
+             * same number of VFs that where active.
+             */
+            sriov->num_vfs = pci_conf_read16(pdev->seg, pdev->bus,
+                                             PCI_SLOT(pdev->devfn),
+                                             PCI_FUNC(pdev->devfn),
+                                             pos + PCI_SRIOV_NUM_VF);
+
+            /*
+             * NB: VFE needs to be enabled before calling pci_add_device so Xen
+             * can access the config space of VFs.
+             */
+            pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                             PCI_FUNC(pdev->devfn), reg,
+                             control | PCI_SRIOV_CTRL_VFE);
+
+            /*
+             * The spec states that the software must wait at least 100ms
+             * before attempting to access VF registers when enabling virtual
+             * functions on the PF.
+             */
+            mdelay(100);
+        }
+
+        for ( i = 0; i < sriov->num_vfs; i++ )
+        {
+            const pci_sbdf_t bdf = {
+                .bdf = PCI_BDF2(pdev->bus, pdev->devfn) + offset + stride * i,
+            };
+
+            if ( new_enabled )
+            {
+                const struct pci_dev_info info = {
+                    .is_virtfn = true,
+                    .physfn.bus = pdev->bus,
+                    .physfn.devfn = pdev->devfn,
+                };
+
+                rc = pci_add_device(pdev->seg, bdf.bus, bdf.extfunc, &info,
+                                    pdev->node);
+            }
+            else
+                rc = pci_remove_device(pdev->seg, bdf.bus, bdf.extfunc);
+            if ( rc )
+                gprintk(XENLOG_ERR, "failed to %s VF %04x:%02x:%02x.%u: %d\n",
+                        new_enabled ? "add" : "remove", pdev->seg, bdf.bus,
+                        bdf.dev, bdf.func, rc);
+
+            pcidevs_lock();
+            sriov->vf[i] = pci_get_pdev(pdev->seg, bdf.bus, bdf.extfunc);
+            pcidevs_unlock();
+        }
+
+        if ( new_mem_enabled )
+            modify_memory_mapping(pdev, pos, true);
+    }
+    else if ( new_mem_enabled != mem_enabled && new_enabled )
+        modify_memory_mapping(pdev, pos, new_mem_enabled);
+
+    pci_conf_write16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                     PCI_FUNC(pdev->devfn), reg, val);
+}
+
+static int init_sriov(struct pci_dev *pdev)
+{
+    unsigned int pos = pci_find_ext_capability(pdev->seg, pdev->bus,
+                                               pdev->devfn,
+                                               PCI_EXT_CAP_ID_SRIOV);
+    uint16_t total_vfs;
+
+    if ( !pos )
+        return 0;
+
+    total_vfs = pci_conf_read16(pdev->seg, pdev->bus, PCI_SLOT(pdev->devfn),
+                                PCI_FUNC(pdev->devfn),
+                                pos + PCI_SRIOV_TOTAL_VF);
+
+    pdev->vpci->sriov = xzalloc_bytes(SRIOV_SIZE(total_vfs));
+    if ( !pdev->vpci->sriov )
+        return -ENOMEM;
+
+    return vpci_add_register(pdev->vpci, vpci_hw_read16, control_write,
+                             pos + PCI_SRIOV_CTRL, 2, pdev->vpci->sriov);
+}
+
+static void teardown_sriov(struct pci_dev *pdev)
+{
+    if ( pdev->vpci->sriov )
+    {
+        /* TODO: removing PFs is not currently supported. */
+        ASSERT_UNREACHABLE();
+        domain_crash(pdev->domain);
+    }
+}
+REGISTER_VPCI_INIT(init_sriov, teardown_sriov, VPCI_PRIORITY_LOW);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index 208667227e..bd92c7cd12 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -94,7 +94,6 @@ struct vpci {
          * is mapped into guest p2m) if there's a ROM BAR on the device.
          */
         bool rom_enabled      : 1;
-        /* FIXME: currently there's no support for SR-IOV. */
     } header;
 
     /* MSI data. */
@@ -144,6 +143,11 @@ struct vpci {
             struct vpci_arch_msix_entry arch;
         } entries[];
     } *msix;
+
+    struct vpci_sriov {
+        uint16_t num_vfs;
+        struct pci_dev *vf[];
+    } *sriov;
 #endif
 };
 
@@ -214,6 +218,9 @@ static inline unsigned int vmsix_entry_nr(const struct 
vpci_msix *msix,
 {
     return entry - msix->entries;
 }
+
+/* Map/unmap the BARs of a vPCI device. */
+int vpci_modify_bars(const struct pci_dev *pdev, bool map, bool rom_only);
 #endif /* __XEN__ */
 
 #else /* !CONFIG_HAS_VPCI */
-- 
2.17.1


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.