[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen stable-4.1] x86/MSI: add mechanism to fully protect MSI-X table from PV guest accesses



commit b56fe0b46f98afe918455bae193d543d3ffc4598
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Tue Mar 12 16:46:09 2013 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Tue Mar 12 16:46:09 2013 +0100

    x86/MSI: add mechanism to fully protect MSI-X table from PV guest accesses
    
    This adds two new physdev operations for Dom0 to invoke when resource
    allocation for devices is known to be complete, so that the hypervisor
    can arrange for the respective MMIO ranges to be marked read-only
    before an eventual guest getting such a device assigned even gets
    started, such that it won't be able to set up writable mappings for
    these MMIO ranges before Xen has a chance to protect them.
    
    This also addresses another issue with the code being modified here,
    in that so far write protection for the address ranges in question got
    set up only once during the lifetime of a device (i.e. until either
    system shutdown or device hot removal), while teardown happened when
    the last interrupt was disposed of by the guest (which at least allowed
    the tables to be writable when the device got assigned to a second
    guest [instance] after the first terminated).
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    master changeset: 4245d331e0e75de8d1bddbbb518f3a8ce6d0bb7e
    master date: 2013-03-08 14:05:34 +0100
---
 xen/arch/x86/msi.c           |  198 +++++++++++++++++++++++++++---------------
 xen/arch/x86/physdev.c       |   14 +++
 xen/include/asm-x86/msi.h    |    1 +
 xen/include/public/physdev.h |   15 +++
 xen/include/xen/pci.h        |    1 +
 5 files changed, 160 insertions(+), 69 deletions(-)

diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 1579dfc..ef332fb 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -602,8 +602,8 @@ static u64 read_pci_mem_bar(u8 bus, u8 slot, u8 func, u8 bir, int vf)
  * @entries: pointer to an array of struct msix_entry entries
  * @nvec: number of @entries
  *
- * Setup the MSI-X capability structure of device function with a
- * single MSI-X irq. A return of zero indicates the successful setup of
+ * Setup the MSI-X capability structure of device function with the requested
+ * number MSI-X irqs. A return of zero indicates the successful setup of
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev,
@@ -611,84 +611,67 @@ static int msix_capability_init(struct pci_dev *dev,
                                 struct msi_desc **desc,
                                 unsigned int nr_entries)
 {
-    struct msi_desc *entry;
-    int pos;
+    struct msi_desc *entry = NULL;
+    int pos, vf;
     u16 control;
-    u64 table_paddr, entry_paddr;
-    u32 table_offset, entry_offset;
-    u8 bir;
-    void __iomem *base;
-    int idx;
+    u64 table_paddr;
+    u32 table_offset;
+    u8 bir, pbus, pslot, pfunc;
     u8 bus = dev->bus;
     u8 slot = PCI_SLOT(dev->devfn);
     u8 func = PCI_FUNC(dev->devfn);
 
     ASSERT(spin_is_locked(&pcidevs_lock));
-    ASSERT(desc);
 
     pos = pci_find_cap_offset(bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
     msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
 
-    /* MSI-X Table Initialization */
-    entry = alloc_msi_entry();
-    if ( !entry )
-        return -ENOMEM;
+    if ( desc )
+    {
+        entry = alloc_msi_entry();
+        if ( !entry )
+            return -ENOMEM;
+        ASSERT(msi);
+    }
 
-    /* Request & Map MSI-X table region */
+    /* Locate MSI-X table region */
     table_offset = pci_conf_read32(bus, slot, func, msix_table_offset_reg(pos));
     bir = (u8)(table_offset & PCI_MSIX_BIRMASK);
     table_offset &= ~PCI_MSIX_BIRMASK;
-    entry_offset = msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
 
-    table_paddr = msi->table_base + table_offset;
-    entry_paddr = table_paddr + entry_offset;
-    idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
-    if ( idx < 0 )
+    if ( !dev->info.is_virtfn )
     {
-        xfree(entry);
-        return idx;
+        pbus = bus;
+        pslot = slot;
+        pfunc = func;
+        vf = -1;
     }
-    base = (void *)(fix_to_virt(idx) +
-        ((unsigned long)entry_paddr & ((1UL << PAGE_SHIFT) - 1)));
-
-    entry->msi_attrib.type = PCI_CAP_ID_MSIX;
-    entry->msi_attrib.is_64 = 1;
-    entry->msi_attrib.entry_nr = msi->entry_nr;
-    entry->msi_attrib.maskbit = 1;
-    entry->msi_attrib.masked = 1;
-    entry->msi_attrib.pos = pos;
-    entry->irq = msi->irq;
-    entry->dev = dev;
-    entry->mask_base = base;
-
-    list_add_tail(&entry->list, &dev->msi_list);
-
-    if ( !dev->msix_nr_entries )
+    else
     {
-        u8 pbus, pslot, pfunc;
-        int vf;
-        u64 pba_paddr;
-        u32 pba_offset;
+        pbus = dev->info.physfn.bus;
+        pslot = PCI_SLOT(dev->info.physfn.devfn);
+        pfunc = PCI_FUNC(dev->info.physfn.devfn);
+        vf = PCI_BDF2(dev->bus, dev->devfn);
+    }
 
-        if ( !dev->info.is_virtfn )
-        {
-            pbus = bus;
-            pslot = slot;
-            pfunc = func;
-            vf = -1;
-        }
-        else
+    table_paddr = read_pci_mem_bar(pbus, pslot, pfunc, bir, vf);
+    WARN_ON(msi && msi->table_base != table_paddr);
+    if ( !table_paddr )
+    {
+        if ( !msi || !msi->table_base )
         {
-            pbus = dev->info.physfn.bus;
-            pslot = PCI_SLOT(dev->info.physfn.devfn);
-            pfunc = PCI_FUNC(dev->info.physfn.devfn);
-            vf = PCI_BDF2(dev->bus, dev->devfn);
+            xfree(entry);
+            return -ENXIO;
         }
+        table_paddr = msi->table_base;
+    }
+    table_paddr += table_offset;
 
-        ASSERT(!dev->msix_used_entries);
-        WARN_ON(msi->table_base !=
-                read_pci_mem_bar(pbus, pslot, pfunc, bir, vf));
+    if ( !dev->msix_used_entries )
+    {
+        u64 pba_paddr;
+        u32 pba_offset;
 
         dev->msix_nr_entries = nr_entries;
         dev->msix_table.first = PFN_DOWN(table_paddr);
@@ -709,7 +692,42 @@ static int msix_capability_init(struct pci_dev *dev,
                                       BITS_TO_LONGS(nr_entries) - 1);
         WARN_ON(rangeset_overlaps_range(mmio_ro_ranges, dev->msix_pba.first,
                                         dev->msix_pba.last));
+    }
+
+    if ( entry )
+    {
+        /* Map MSI-X table region */
+        u64 entry_paddr = table_paddr + msi->entry_nr * PCI_MSIX_ENTRY_SIZE;
+        int idx = msix_get_fixmap(dev, table_paddr, entry_paddr);
+        void __iomem *base;
+
+        if ( idx < 0 )
+        {
+            xfree(entry);
+            return idx;
+        }
+        base = (void *)(fix_to_virt(idx) +
+                        ((unsigned long)entry_paddr & (PAGE_SIZE - 1)));
+
+        /* Mask interrupt here */
+        writel(1, base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+
+        entry->msi_attrib.type = PCI_CAP_ID_MSIX;
+        entry->msi_attrib.is_64 = 1;
+        entry->msi_attrib.entry_nr = msi->entry_nr;
+        entry->msi_attrib.maskbit = 1;
+        entry->msi_attrib.masked = 1;
+        entry->msi_attrib.pos = pos;
+        entry->irq = msi->irq;
+        entry->dev = dev;
+        entry->mask_base = base;
+
+        list_add_tail(&entry->list, &dev->msi_list);
+        *desc = entry;
+    }
 
+    if ( !dev->msix_used_entries )
+    {
         if ( rangeset_add_range(mmio_ro_ranges, dev->msix_table.first,
                                 dev->msix_table.last) )
             WARN();
@@ -720,7 +738,7 @@ static int msix_capability_init(struct pci_dev *dev,
         if ( dev->domain )
             p2m_change_entry_type_global(p2m_get_hostp2m(dev->domain),
                                          p2m_mmio_direct, p2m_mmio_direct);
-        if ( !dev->domain || !paging_mode_translate(dev->domain) )
+        if ( desc && (!dev->domain || !paging_mode_translate(dev->domain)) )
         {
             struct domain *d = dev->domain;
 
@@ -734,6 +752,13 @@ static int msix_capability_init(struct pci_dev *dev,
                         break;
             if ( d )
             {
+                if ( !IS_PRIV(d) && dev->msix_warned != d->domain_id )
+                {
+                    dev->msix_warned = d->domain_id;
+                    printk(XENLOG_ERR
+                           "Potentially insecure use of MSI-X on %02x:%02x.%u by Dom%d\n",
+                           bus, slot, func, d->domain_id);
+                }
                 /* XXX How to deal with existing mappings? */
             }
         }
@@ -742,10 +767,6 @@ static int msix_capability_init(struct pci_dev *dev,
     WARN_ON(dev->msix_table.first != (table_paddr >> PAGE_SHIFT));
     ++dev->msix_used_entries;
 
-    /* Mask interrupt here */
-    writel(1, entry->mask_base + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
-
-    *desc = entry;
     /* Restore MSI-X enabled bits */
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
 
@@ -876,6 +897,19 @@ static int __pci_enable_msix(struct msi_info *msi, struct msi_desc **desc)
     return status;
 }
 
+static void _pci_cleanup_msix(struct pci_dev *dev)
+{
+    if ( !--dev->msix_used_entries )
+    {
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
+                                   dev->msix_table.last) )
+            WARN();
+        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
+                                   dev->msix_pba.last) )
+            WARN();
+    }
+}
+
 static void __pci_disable_msix(struct msi_desc *entry)
 {
     struct pci_dev *dev;
@@ -898,15 +932,41 @@ static void __pci_disable_msix(struct msi_desc *entry)
 
     pci_conf_write16(bus, slot, func, msix_control_reg(pos), control);
 
-    if ( !--dev->msix_used_entries )
+    _pci_cleanup_msix(dev);
+}
+
+int pci_prepare_msix(u8 bus, u8 devfn, bool_t off)
+{
+    int rc;
+    struct pci_dev *pdev;
+    u8 slot = PCI_SLOT(devfn), func = PCI_FUNC(devfn);
+    unsigned int pos = pci_find_cap_offset(bus, slot, func,
+                                           PCI_CAP_ID_MSIX);
+
+    if ( !pos )
+        return -ENODEV;
+
+    spin_lock(&pcidevs_lock);
+    pdev = pci_get_pdev(bus, devfn);
+    if ( !pdev )
+        rc = -ENODEV;
+    else if ( pdev->msix_used_entries != !!off )
+        rc = -EBUSY;
+    else if ( off )
     {
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_table.first,
-                                  dev->msix_table.last) )
-            WARN();
-        if ( rangeset_remove_range(mmio_ro_ranges, dev->msix_pba.first,
-                                   dev->msix_pba.last) )
-            WARN();
+        _pci_cleanup_msix(pdev);
+        rc = 0;
     }
+    else
+    {
+        u16 control = pci_conf_read16(bus, slot, func, msix_control_reg(pos));
+
+        rc = msix_capability_init(pdev, NULL, NULL,
+                                  multi_msix_capable(control));
+    }
+    spin_unlock(&pcidevs_lock);
+
+    return rc;
 }
 
 /*
diff --git a/xen/arch/x86/physdev.c b/xen/arch/x86/physdev.c
index de364f0..8feb84a 100644
--- a/xen/arch/x86/physdev.c
+++ b/xen/arch/x86/physdev.c
@@ -548,6 +548,20 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
+    case PHYSDEVOP_prepare_msix:
+    case PHYSDEVOP_release_msix: {
+        struct physdev_pci_device dev;
+
+        if ( copy_from_guest(&dev, arg, 1) )
+            ret = -EFAULT;
+        else if ( dev.seg )
+            ret = -EOPNOTSUPP;
+        else
+            ret = pci_prepare_msix(dev.bus, dev.devfn,
+                                   cmd != PHYSDEVOP_prepare_msix);
+        break;
+    }
+
     case PHYSDEVOP_restore_msi: {
         struct physdev_restore_msi restore_msi;
         struct pci_dev *pdev;
diff --git a/xen/include/asm-x86/msi.h b/xen/include/asm-x86/msi.h
index dc402b9..bd83a57 100644
--- a/xen/include/asm-x86/msi.h
+++ b/xen/include/asm-x86/msi.h
@@ -75,6 +75,7 @@ extern void unmask_msi_irq(unsigned int irq);
 extern void set_msi_affinity(unsigned int vector, cpumask_t mask);
 extern int pci_enable_msi(struct msi_info *msi, struct msi_desc **desc);
 extern void pci_disable_msi(struct msi_desc *desc);
+extern int pci_prepare_msix(u8 bus, u8 devfn, bool_t off);
 extern void pci_cleanup_msi(struct pci_dev *pdev);
 extern int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq);
 extern void teardown_msi_irq(int irq);
diff --git a/xen/include/public/physdev.h b/xen/include/public/physdev.h
index 6f792ee..d65f72b 100644
--- a/xen/include/public/physdev.h
+++ b/xen/include/public/physdev.h
@@ -264,6 +264,21 @@ typedef struct physdev_get_free_pirq physdev_get_free_pirq_t;
 DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t);
 
 /*
+ * Dom0 should use these two to announce MMIO resources assigned to
+ * MSI-X capable devices won't (prepare) or may (release) change.
+ */
+#define PHYSDEVOP_prepare_msix          30
+#define PHYSDEVOP_release_msix          31
+struct physdev_pci_device {
+    /* IN */
+    uint16_t seg;
+    uint8_t bus;
+    uint8_t devfn;
+};
+typedef struct physdev_pci_device physdev_pci_device_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t);
+
+/*
  * Notify that some PIRQ-bound event channels have been unmasked.
  * ** This command is obsolete since interface version 0x00030202 and is **
  * ** unsupported by newer versions of Xen.                              **
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index e1b9cdd..3b8067e 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -57,6 +57,7 @@ struct pci_dev {
     int msix_table_refcnt[MAX_MSIX_TABLE_PAGES];
     int msix_table_idx[MAX_MSIX_TABLE_PAGES];
     spinlock_t msix_table_lock;
+    domid_t msix_warned;
 
     struct domain *domain;
     const u8 bus;
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.1

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.