
[PATCH v7 06/12] vpci/header: handle p2m range sets per BAR


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
  • Date: Tue, 13 Jun 2023 10:32:28 +0000
  • Cc: Oleksandr Andrushchenko <Oleksandr_Andrushchenko@xxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>
  • Delivery-date: Tue, 13 Jun 2023 10:33:04 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>
  • Thread-topic: [PATCH v7 06/12] vpci/header: handle p2m range sets per BAR

From: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>

Instead of handling a single range set that contains all the memory
regions of all the BARs and the ROM, have one range set per BAR.
As the range sets are now created when a PCI device is added and
destroyed when it is removed, make them named and accounted.

Note that rangesets were chosen here even though each set holds at most
3 separate ranges (typically just 1): a rangeset per BAR keeps the
implementation simple and re-uses existing code.
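
For illustration, a minimal sketch of the per-BAR rangeset lifecycle this
change introduces, using the existing rangeset API (error handling is
trimmed, and "other" stands in for a previously set up BAR; it is not a
name used in the patch):

    unsigned long start = PFN_DOWN(bar->addr);
    unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
    char str[32];
    int rc;

    /* Device add: one named, per-domain accounted rangeset per BAR. */
    snprintf(str, sizeof(str), "%pp:BAR%d", &pdev->sbdf, i);
    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
    if ( !bar->mem )
        return -ENOMEM;

    /* Decode toggle: fill with this BAR's page range, then subtract
     * pages already covered by a previously set up BAR. */
    rc = rangeset_add_range(bar->mem, start, end);
    if ( !rc && !rangeset_is_empty(other->mem) )
        rc = rangeset_remove_range(other->mem, start, end);

    /* Device remove: drop any leftover ranges and the set itself. */
    rangeset_destroy(bar->mem);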

This is in preparation for making non-identity mappings in p2m for the MMIOs.
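
As a rough sketch of how each per-BAR set is then consumed on the
deferred {un}map path (assuming the callback shape of map_range() in
xen/drivers/vpci/header.c; "map_cb" is a placeholder name and the p2m
work itself is elided):

    static int map_cb(unsigned long s, unsigned long e, void *data,
                      unsigned long *c)
    {
        /* Map or unmap [s, e] in the p2m, bumping *c as pages are
         * done; return -ERESTART when preempted so the walk can be
         * resumed where it stopped. */
        return 0;
    }

    rc = rangeset_consume_ranges(bar->mem, map_cb, &data);
    if ( rc == -ERESTART )
        /* Retried later via vpci_process_pending(). */
        return true;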

Signed-off-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@xxxxxxxx>

---
Since v6:
- update according to the new locking scheme
- remove odd fail label in modify_bars
Since v5:
- fix comments
- move rangeset allocation to init_bars and only allocate
  for MAPPABLE BARs
- check for overlap with the already setup BAR ranges
Since v4:
- use named range sets for BARs (Jan)
- changes required by the new locking scheme
- updated commit message (Jan)
Since v3:
- re-work vpci_cancel_pending according to the per-BAR handling
- s/num_mem_ranges/map_pending and s/uint8_t/bool
- ASSERT(bar->mem) in modify_bars
- create and destroy the rangesets on add/remove
---
 xen/drivers/vpci/header.c | 238 +++++++++++++++++++++++++++-----------
 xen/drivers/vpci/vpci.c   |   5 +
 xen/include/xen/vpci.h    |   3 +-
 3 files changed, 180 insertions(+), 66 deletions(-)

diff --git a/xen/drivers/vpci/header.c b/xen/drivers/vpci/header.c
index 8eebbf968b..8e722857d6 100644
--- a/xen/drivers/vpci/header.c
+++ b/xen/drivers/vpci/header.c
@@ -141,63 +141,102 @@ static void modify_decoding(const struct pci_dev *pdev, uint16_t cmd,
 
 bool vpci_process_pending(struct vcpu *v)
 {
-    if ( v->vpci.mem )
+    struct pci_dev *pdev = v->vpci.pdev;
+
+    if ( !pdev )
+        return false;
+
+    read_lock(&v->domain->vpci_rwlock);
+
+    if ( v->vpci.map_pending )
     {
         struct map_data data = {
             .d = v->domain,
             .map = v->vpci.cmd & PCI_COMMAND_MEMORY,
         };
-        int rc = rangeset_consume_ranges(v->vpci.mem, map_range, &data);
-
-        if ( rc == -ERESTART )
-            return true;
-
-        read_lock(&v->domain->vpci_rwlock);
-        spin_lock(&v->vpci.pdev->vpci->lock);
-        /* Disable memory decoding unconditionally on failure. */
-        modify_decoding(v->vpci.pdev,
-                        rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY : v->vpci.cmd,
-                        !rc && v->vpci.rom_only);
-        spin_unlock(&v->vpci.pdev->vpci->lock);
-        read_unlock(&v->domain->vpci_rwlock);
-
-        rangeset_destroy(v->vpci.mem);
-        v->vpci.mem = NULL;
-        if ( rc )
-            /*
-             * FIXME: in case of failure remove the device from the domain.
-             * Note that there might still be leftover mappings. While this is
-             * safe for Dom0, for DomUs the domain will likely need to be
-             * killed in order to avoid leaking stale p2m mappings on
-             * failure.
-             */
-            vpci_remove_device(v->vpci.pdev);
+        struct vpci_header *header = &pdev->vpci->header;
+        unsigned int i;
+
+        for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+        {
+            struct vpci_bar *bar = &header->bars[i];
+            int rc;
+
+            if ( rangeset_is_empty(bar->mem) )
+                continue;
+
+            rc = rangeset_consume_ranges(bar->mem, map_range, &data);
+
+            if ( rc == -ERESTART )
+            {
+                read_unlock(&v->domain->vpci_rwlock);
+                return true;
+            }
+
+            spin_lock(&pdev->vpci->lock);
+            /* Disable memory decoding unconditionally on failure. */
+            modify_decoding(pdev, rc ? v->vpci.cmd & ~PCI_COMMAND_MEMORY :
+                                       v->vpci.cmd, !rc && v->vpci.rom_only);
+            spin_unlock(&pdev->vpci->lock);
+
+            if ( rc )
+            {
+                /*
+                 * FIXME: in case of failure remove the device from the domain.
+                 * Note that there might still be leftover mappings. While this
+                 * is safe for Dom0, for DomUs the domain needs to be killed in
+                 * order to avoid leaking stale p2m mappings on failure.
+                 */
+                v->vpci.map_pending = false;
+                read_unlock(&v->domain->vpci_rwlock);
+
+                if ( is_hardware_domain(v->domain) )
+                    vpci_remove_device(pdev);
+                else
+                    domain_crash(v->domain);
+
+                return false;
+            }
+        }
+
+        v->vpci.map_pending = false;
     }
 
+    read_unlock(&v->domain->vpci_rwlock);
+
     return false;
 }
 
 static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
-                            struct rangeset *mem, uint16_t cmd)
+                            uint16_t cmd)
 {
     struct map_data data = { .d = d, .map = true };
-    int rc;
+    struct vpci_header *header = &pdev->vpci->header;
+    int rc = 0;
+    unsigned int i;
 
     ASSERT(rw_is_write_locked(&d->vpci_rwlock));
 
-    while ( (rc = rangeset_consume_ranges(mem, map_range, &data)) == -ERESTART )
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
-        /*
-         * It's safe to drop and reacquire the lock in this context
-         * without risking pdev disappearing because devices cannot be
-         * removed until the initial domain has been started.
-         */
-        write_unlock(&d->vpci_rwlock);
-        process_pending_softirqs();
-        write_lock(&d->vpci_rwlock);
-    }
+        struct vpci_bar *bar = &header->bars[i];
+
+        if ( rangeset_is_empty(bar->mem) )
+            continue;
 
-    rangeset_destroy(mem);
+        while ( (rc = rangeset_consume_ranges(bar->mem, map_range,
+                                              &data)) == -ERESTART )
+        {
+            /*
+             * It's safe to drop and reacquire the lock in this context
+             * without risking pdev disappearing because devices cannot be
+             * removed until the initial domain has been started.
+             */
+            write_unlock(&d->vpci_rwlock);
+            process_pending_softirqs();
+            write_lock(&d->vpci_rwlock);
+        }
+    }
     if ( !rc )
         modify_decoding(pdev, cmd, false);
 
@@ -205,10 +244,12 @@ static int __init apply_map(struct domain *d, const struct pci_dev *pdev,
 }
 
 static void defer_map(struct domain *d, struct pci_dev *pdev,
-                      struct rangeset *mem, uint16_t cmd, bool rom_only)
+                      uint16_t cmd, bool rom_only)
 {
     struct vcpu *curr = current;
 
+    ASSERT(rw_is_write_locked(&pdev->domain->vpci_rwlock));
+
     /*
      * FIXME: when deferring the {un}map the state of the device should not
      * be trusted. For example the enable bit is toggled after the device
@@ -216,7 +257,7 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
      * started for the same device if the domain is not well-behaved.
      */
     curr->vpci.pdev = pdev;
-    curr->vpci.mem = mem;
+    curr->vpci.map_pending = true;
     curr->vpci.cmd = cmd;
     curr->vpci.rom_only = rom_only;
     /*
@@ -231,30 +272,31 @@ static void defer_map(struct domain *d, struct pci_dev *pdev,
 static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
 {
     struct vpci_header *header = &pdev->vpci->header;
-    struct rangeset *mem = rangeset_new(NULL, NULL, 0);
     struct pci_dev *tmp, *dev = NULL;
     const struct vpci_msix *msix = pdev->vpci->msix;
-    unsigned int i;
+    unsigned int i, j;
     int rc;
-
-    if ( !mem )
-        return -ENOMEM;
+    bool map_pending;
 
     /*
-     * Create a rangeset that represents the current device BARs memory region
-     * and compare it against all the currently active BAR memory regions. If
-     * an overlap is found, subtract it from the region to be mapped/unmapped.
+     * Create a rangeset per BAR that represents the current device memory
+     * region and compare it against all the currently active BAR memory
+     * regions. If an overlap is found, subtract it from the region to be
+     * mapped/unmapped.
      *
-     * First fill the rangeset with all the BARs of this device or with the ROM
+     * First fill the rangesets with the BARs of this device or with the ROM
      * BAR only, depending on whether the guest is toggling the memory decode
      * bit of the command register, or the enable bit of the ROM BAR register.
      */
     for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
     {
-        const struct vpci_bar *bar = &header->bars[i];
+        struct vpci_bar *bar = &header->bars[i];
         unsigned long start = PFN_DOWN(bar->addr);
         unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
+        if ( !bar->mem )
+            continue;
+
         if ( !MAPPABLE_BAR(bar) ||
              (rom_only ? bar->type != VPCI_BAR_ROM
                       : (bar->type == VPCI_BAR_ROM && !header->rom_enabled)) ||
@@ -270,14 +312,31 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             continue;
         }
 
-        rc = rangeset_add_range(mem, start, end);
+        rc = rangeset_add_range(bar->mem, start, end);
         if ( rc )
         {
             printk(XENLOG_G_WARNING "Failed to add [%lx, %lx]: %d\n",
                    start, end, rc);
-            rangeset_destroy(mem);
             return rc;
         }
+
+        /* Check for overlap with the already setup BAR ranges. */
+        for ( j = 0; j < i; j++ )
+        {
+            struct vpci_bar *prev_bar = &header->bars[j];
+
+            if ( rangeset_is_empty(prev_bar->mem) )
+                continue;
+
+            rc = rangeset_remove_range(prev_bar->mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove overlapping range [%lx, %lx]: %d\n",
+                       start, end, rc);
+                return rc;
+            }
+        }
     }
 
     /* Remove any MSIX regions if present. */
@@ -287,14 +346,21 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
         unsigned long end = PFN_DOWN(vmsix_table_addr(pdev->vpci, i) +
                                      vmsix_table_size(pdev->vpci, i) - 1);
 
-        rc = rangeset_remove_range(mem, start, end);
-        if ( rc )
+        for ( j = 0; j < ARRAY_SIZE(header->bars); j++ )
         {
-            printk(XENLOG_G_WARNING
-                   "Failed to remove MSIX table [%lx, %lx]: %d\n",
-                   start, end, rc);
-            rangeset_destroy(mem);
-            return rc;
+            const struct vpci_bar *bar = &header->bars[j];
+
+            if ( rangeset_is_empty(bar->mem) )
+                continue;
+
+            rc = rangeset_remove_range(bar->mem, start, end);
+            if ( rc )
+            {
+                printk(XENLOG_G_WARNING
+                       "Failed to remove MSIX table [%lx, %lx]: %d\n",
+                       start, end, rc);
+                return rc;
+            }
         }
     }
 
@@ -327,7 +393,8 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
             unsigned long start = PFN_DOWN(bar->addr);
             unsigned long end = PFN_DOWN(bar->addr + bar->size - 1);
 
-            if ( !bar->enabled || !rangeset_overlaps_range(mem, start, end) ||
+            if ( !bar->enabled ||
+                 !rangeset_overlaps_range(bar->mem, start, end) ||
                  /*
                   * If only the ROM enable bit is toggled check against other
                   * BARs in the same device for overlaps, but not against the
@@ -336,12 +403,11 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
                  (rom_only && tmp == pdev && bar->type == VPCI_BAR_ROM) )
                 continue;
 
-            rc = rangeset_remove_range(mem, start, end);
+            rc = rangeset_remove_range(bar->mem, start, end);
             if ( rc )
             {
                 printk(XENLOG_G_WARNING "Failed to remove [%lx, %lx]: %d\n",
                        start, end, rc);
-                rangeset_destroy(mem);
                 pcidevs_unlock();
                 return rc;
             }
@@ -362,10 +428,23 @@ static int modify_bars(const struct pci_dev *pdev, uint16_t cmd, bool rom_only)
          * will always be to establish mappings and process all the BARs.
          */
         ASSERT((cmd & PCI_COMMAND_MEMORY) && !rom_only);
-        return apply_map(pdev->domain, pdev, mem, cmd);
+        return apply_map(pdev->domain, pdev, cmd);
     }
 
-    defer_map(dev->domain, dev, mem, cmd, rom_only);
+    /* Find out whether any memory ranges remain after MSI-X and overlaps. */
+    map_pending = false;
+    for ( i = 0; i < ARRAY_SIZE(header->bars); i++ )
+        if ( !rangeset_is_empty(header->bars[i].mem) )
+        {
+            map_pending = true;
+            break;
+        }
+
+    /* If there's no mapping work left, write the command register now. */
+    if ( !map_pending )
+        pci_conf_write16(pdev->sbdf, PCI_COMMAND, cmd);
+    else
+        defer_map(dev->domain, dev, cmd, rom_only);
 
     return 0;
 }
@@ -556,6 +635,19 @@ static void cf_check rom_write(
         rom->addr = val & PCI_ROM_ADDRESS_MASK;
 }
 
+static int bar_add_rangeset(struct pci_dev *pdev, struct vpci_bar *bar, int i)
+{
+    char str[32];
+
+    snprintf(str, sizeof(str), "%pp:BAR%d", &pdev->sbdf, i);
+
+    bar->mem = rangeset_new(pdev->domain, str, RANGESETF_no_print);
+    if ( !bar->mem )
+        return -ENOMEM;
+
+    return 0;
+}
+
 static int cf_check init_bars(struct pci_dev *pdev)
 {
     uint16_t cmd;
@@ -639,6 +731,13 @@ static int cf_check init_bars(struct pci_dev *pdev)
         else
             bars[i].type = VPCI_BAR_MEM32;
 
+        rc = bar_add_rangeset(pdev, &bars[i], i);
+        if ( rc )
+        {
+            bars[i].type = VPCI_BAR_EMPTY;
+            return rc;
+        }
+
         rc = pci_size_mem_bar(pdev->sbdf, reg, &addr, &size,
                               (i == num_bars - 1) ? PCI_BAR_LAST : 0);
         if ( rc < 0 )
@@ -690,6 +789,15 @@ static int cf_check init_bars(struct pci_dev *pdev)
                                    rom_reg, 4, rom);
             if ( rc )
                 rom->type = VPCI_BAR_EMPTY;
+            else
+            {
+                rc = bar_add_rangeset(pdev, rom, i);
+                if ( rc )
+                {
+                    rom->type = VPCI_BAR_EMPTY;
+                    return rc;
+                }
+            }
 
             header->rom_reg = rom_reg;
         }
diff --git a/xen/drivers/vpci/vpci.c b/xen/drivers/vpci/vpci.c
index 4182c65eaa..b542ddaf7b 100644
--- a/xen/drivers/vpci/vpci.c
+++ b/xen/drivers/vpci/vpci.c
@@ -39,6 +39,7 @@ extern vpci_register_init_t *const __end_vpci_array[];
 void vpci_remove_device(struct pci_dev *pdev)
 {
     struct vpci *vpci;
+    unsigned int i;
 
     if ( !has_vpci(pdev->domain) )
         return;
@@ -73,6 +74,10 @@ void vpci_remove_device(struct pci_dev *pdev)
             if ( pdev->vpci->msix->table[i] )
                 iounmap(pdev->vpci->msix->table[i]);
     }
+
+    for ( i = 0; i < ARRAY_SIZE(vpci->header.bars); i++ )
+        rangeset_destroy(vpci->header.bars[i].mem);
+
     xfree(vpci->msix);
     xfree(vpci->msi);
     xfree(vpci);
diff --git a/xen/include/xen/vpci.h b/xen/include/xen/vpci.h
index e9170cc8ca..1e42a59c1d 100644
--- a/xen/include/xen/vpci.h
+++ b/xen/include/xen/vpci.h
@@ -75,6 +75,7 @@ struct vpci {
             /* Guest view of the BAR: address and lower bits. */
             uint64_t guest_reg;
             uint64_t size;
+            struct rangeset *mem;
             enum {
                 VPCI_BAR_EMPTY,
                 VPCI_BAR_IO,
@@ -162,9 +163,9 @@ struct vpci {
 
 struct vpci_vcpu {
     /* Per-vcpu structure to store state while {un}mapping of PCI BARs. */
-    struct rangeset *mem;
     struct pci_dev *pdev;
     uint16_t cmd;
+    bool map_pending : 1;
     bool rom_only : 1;
 };
 
-- 
2.40.1