[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[RFC PATCH] pci: introduce per-domain PCI rwlock


  • To: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
  • From: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
  • Date: Tue, 11 Jul 2023 00:46:04 +0000
  • Accept-language: en-US
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=epam.com; dmarc=pass action=none header.from=epam.com; dkim=pass header.d=epam.com; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=Oq7MAyYNyIyiERDgN5/HwhrVdFHh73QKIalzw9eg63s=; b=JeNcy17B76YEwFSEQ5yGmEVQ77ujrMtzJCLKHmathUF/GJsTe9ZBVK0/K9qGGZjaaHlmh4fUfFLWf8tyaZCUOOmR9DauLBvbQ9hMFEl967gf63xqDJr7yGJHBIdl9E20MFA3ZRtVdNpeqRzvXtJKRL0gTYxNFFKFmjY8H0y2lXiQBaBeA0LaySSygy/MeTsojMGzrP4zuNrOaIljTPNkOQNWPZty1+b2NTJ0P2GheUHJo89tk44/v5Xvz9UvJWqo1eIh+Cfalx/yLzcmw4abBlfCpnov5YDM3KsL06PCSZm6NSDDZN3gfL+JgOnMUX+9ODPlifEQksqayU9QlPAplA==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=bJPJYcqxTCFVvHGHGvIiaYFGlyRS5xWVzdFnALeaLMbYD2O2wuJGP+H8axavqMQE9w2jX0H9P1UpwHA6k/EGVWd52lA5pQKnA4sFS2sp/tBjSG/ZHJ7A2dRjXHIdjyNAwqHBldROZhG4nlbhJULfSXGO+tFYcvAO3mtHdvp9rp47kJfVHKfrLxspCXyZYF7YaNW016UHM+6Y7fAtdqFYH365ogdL2qtLvJ9CojGbvAIsT5PhP9cdYfW0wgWoHW9fhooOXmWRdESuwOZD4v0azCI6DUj62OVcHg7HSMoOkNnel4Y9iwukNMQP/P6WOlkvBLYWVaeNGN4qDdsQ50WNvg==
  • Cc: Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>, Jan Beulich <jbeulich@xxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Jun Nakajima <jun.nakajima@xxxxxxxxx>, Kevin Tian <kevin.tian@xxxxxxxxx>, Paul Durrant <paul@xxxxxxx>, Oleksandr Tyshchenko <olekstysh@xxxxxxxxx>, Oleksandr Andrushchenko <Oleksandr_Andrushchenko@xxxxxxxx>
  • Delivery-date: Tue, 11 Jul 2023 00:46:40 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>
  • Thread-index: AQHZs5ESsjHhLtlMU0qzCbDGds6cOw==
  • Thread-topic: [RFC PATCH] pci: introduce per-domain PCI rwlock

Add a per-domain d->pci_lock that protects access to
d->pdev_list. The purpose of this lock is to guarantee to the VPCI code
that the underlying pdev will not disappear from under its feet. Later it
will also protect the pdev->vpci structure and pdev access in modify_bars().

Suggested-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
Suggested-by: Jan Beulich <jbeulich@xxxxxxxx>
Signed-off-by: Volodymyr Babchuk <volodymyr_babchuk@xxxxxxxx>

---

This patch should be part of the VPCI series, but I am posting it as a
single-patch RFC to discuss the changes to the x86 MM and IOMMU code.

I opted to factor out part of the changes from the "vpci: introduce
per-domain lock to protect vpci structure" commit to ease the review process.
---
 xen/arch/x86/hvm/hvm.c                      |  2 +
 xen/arch/x86/hvm/vmx/vmcs.c                 |  2 +
 xen/arch/x86/mm.c                           |  6 ++
 xen/arch/x86/mm/p2m-pod.c                   |  7 +++
 xen/arch/x86/mm/paging.c                    |  6 ++
 xen/common/domain.c                         |  1 +
 xen/drivers/passthrough/amd/iommu_cmd.c     |  4 +-
 xen/drivers/passthrough/amd/pci_amd_iommu.c | 15 ++++-
 xen/drivers/passthrough/pci.c               | 70 +++++++++++++++++----
 xen/drivers/passthrough/vtd/iommu.c         | 19 +++++-
 xen/include/xen/sched.h                     |  1 +
 11 files changed, 117 insertions(+), 16 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index a67ef79dc0..089fbe38a7 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2381,12 +2381,14 @@ int hvm_set_cr0(unsigned long value, bool may_defer)
         }
     }
 
+    read_lock(&d->pci_lock);
     if ( ((value ^ old_value) & X86_CR0_CD) &&
          is_iommu_enabled(d) && hvm_funcs.handle_cd &&
          (!rangeset_is_empty(d->iomem_caps) ||
           !rangeset_is_empty(d->arch.ioport_caps) ||
           has_arch_pdevs(d)) )
         alternative_vcall(hvm_funcs.handle_cd, v, value);
+    read_unlock(&d->pci_lock);
 
     hvm_update_cr(v, 0, value);
 
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index b209563625..88bbcbbd99 100644
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1889,6 +1889,7 @@ void cf_check vmx_do_resume(void)
          *  2: execute wbinvd on all dirty pCPUs when guest wbinvd exits.
          * If VT-d engine can force snooping, we don't need to do these.
          */
+        read_lock(&v->domain->pci_lock);
         if ( has_arch_pdevs(v->domain) && !iommu_snoop
                 && !cpu_has_wbinvd_exiting )
         {
@@ -1896,6 +1897,7 @@ void cf_check vmx_do_resume(void)
             if ( cpu != -1 )
                 flush_mask(cpumask_of(cpu), FLUSH_CACHE);
         }
+        read_unlock(&v->domain->pci_lock);
 
         vmx_clear_vmcs(v);
         vmx_load_vmcs(v);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index be2b10a391..f1e882a980 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -858,12 +858,15 @@ get_page_from_l1e(
         return 0;
     }
 
+    read_lock(&l1e_owner->pci_lock);
     if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
     {
         gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
                  l1f & l1_disallow_mask(l1e_owner));
+        read_unlock(&l1e_owner->pci_lock);
         return -EINVAL;
     }
+    read_unlock(&l1e_owner->pci_lock);
 
     valid = mfn_valid(_mfn(mfn));
 
@@ -2142,12 +2145,15 @@ static int mod_l1_entry(l1_pgentry_t *pl1e, 
l1_pgentry_t nl1e,
     {
         struct page_info *page = NULL;
 
+        read_lock(&pt_dom->pci_lock);
         if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
         {
             gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
                     l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
+            read_unlock(&pt_dom->pci_lock);
             return -EINVAL;
         }
+        read_unlock(&pt_dom->pci_lock);
 
         /* Translate foreign guest address. */
         if ( cmd != MMU_PT_UPDATE_NO_TRANSLATE &&
diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
index 9969eb45fa..07e0bedad7 100644
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -349,10 +349,12 @@ p2m_pod_set_mem_target(struct domain *d, unsigned long 
target)
 
     ASSERT( pod_target >= p2m->pod.count );
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
         ret = -ENOTEMPTY;
     else
         ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+    read_unlock(&d->pci_lock);
 
 out:
     pod_unlock(p2m);
@@ -1401,8 +1403,13 @@ guest_physmap_mark_populate_on_demand(struct domain *d, 
unsigned long gfn,
     if ( !paging_mode_translate(d) )
         return -EINVAL;
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) || cache_flush_permitted(d) )
+    {
+        read_unlock(&d->pci_lock);
         return -ENOTEMPTY;
+    }
+    read_unlock(&d->pci_lock);
 
     do {
         rc = mark_populate_on_demand(d, gfn, chunk_order);
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index 34d833251b..fb8f7ff7cf 100644
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -205,21 +205,27 @@ static int paging_log_dirty_enable(struct domain *d)
 {
     int ret;
 
+    read_lock(&d->pci_lock);
     if ( has_arch_pdevs(d) )
     {
         /*
          * Refuse to turn on global log-dirty mode
          * if the domain is sharing the P2M with the IOMMU.
          */
+        read_unlock(&d->pci_lock);
         return -EINVAL;
     }
 
     if ( paging_mode_log_dirty(d) )
+    {
+        read_unlock(&d->pci_lock);
         return -EINVAL;
+    }
 
     domain_pause(d);
     ret = d->arch.paging.log_dirty.ops->enable(d);
     domain_unpause(d);
+    read_unlock(&d->pci_lock);
 
     return ret;
 }
diff --git a/xen/common/domain.c b/xen/common/domain.c
index caaa402637..5d8a8836da 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -645,6 +645,7 @@ struct domain *domain_create(domid_t domid,
 
 #ifdef CONFIG_HAS_PCI
     INIT_LIST_HEAD(&d->pdev_list);
+    rwlock_init(&d->pci_lock);
 #endif
 
     /* All error paths can depend on the above setup. */
diff --git a/xen/drivers/passthrough/amd/iommu_cmd.c 
b/xen/drivers/passthrough/amd/iommu_cmd.c
index 40ddf366bb..b67aee31f6 100644
--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -308,11 +308,12 @@ void amd_iommu_flush_iotlb(u8 devfn, const struct pci_dev 
*pdev,
     flush_command_buffer(iommu, iommu_dev_iotlb_timeout);
 }
 
-static void amd_iommu_flush_all_iotlbs(const struct domain *d, daddr_t daddr,
+static void amd_iommu_flush_all_iotlbs(struct domain *d, daddr_t daddr,
                                        unsigned int order)
 {
     struct pci_dev *pdev;
 
+    read_lock(&d->pci_lock);
     for_each_pdev( d, pdev )
     {
         u8 devfn = pdev->devfn;
@@ -323,6 +324,7 @@ static void amd_iommu_flush_all_iotlbs(const struct domain 
*d, daddr_t daddr,
         } while ( devfn != pdev->devfn &&
                   PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
     }
+    read_unlock(&d->pci_lock);
 }
 
 /* Flush iommu cache after p2m changes. */
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c 
b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 94e3775506..8541b66a93 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -102,6 +102,8 @@ static bool any_pdev_behind_iommu(const struct domain *d,
 {
     const struct pci_dev *pdev;
 
+    ASSERT(rw_is_locked(&d->pci_lock));
+
     for_each_pdev ( d, pdev )
     {
         if ( pdev == exclude )
@@ -467,17 +469,24 @@ static int cf_check reassign_device(
 
     if ( !QUARANTINE_SKIP(target, pdev) )
     {
+       read_lock(&target->pci_lock);
         rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
         if ( rc )
             return rc;
+       read_unlock(&target->pci_lock);
     }
     else
         amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
 
     if ( devfn == pdev->devfn && pdev->domain != target )
     {
-        list_move(&pdev->domain_list, &target->pdev_list);
-        pdev->domain = target;
+        write_lock(&pdev->domain->pci_lock);
+        list_del(&pdev->domain_list);
+        write_unlock(&pdev->domain->pci_lock);
+
+        write_lock(&target->pci_lock);
+        list_add(&pdev->domain_list, &target->pdev_list);
+        write_unlock(&target->pci_lock);
     }
 
     /*
@@ -628,12 +637,14 @@ static int cf_check amd_iommu_add_device(u8 devfn, struct 
pci_dev *pdev)
         fresh_domid = true;
     }
 
+    read_lock(&pdev->domain->pci_lock);
     ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
     if ( ret && fresh_domid )
     {
         iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
         pdev->arch.pseudo_domid = DOMID_INVALID;
     }
+    read_unlock(&pdev->domain->pci_lock);
 
     return ret;
 }
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 07d1986d33..1831e1b0c0 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -454,7 +454,9 @@ static void __init _pci_hide_device(struct pci_dev *pdev)
     if ( pdev->domain )
         return;
     pdev->domain = dom_xen;
+    write_lock(&dom_xen->pci_lock);
     list_add(&pdev->domain_list, &dom_xen->pdev_list);
+    write_unlock(&dom_xen->pci_lock);
 }
 
 int __init pci_hide_device(unsigned int seg, unsigned int bus,
@@ -530,7 +532,7 @@ struct pci_dev *pci_get_pdev(const struct domain *d, 
pci_sbdf_t sbdf)
 {
     struct pci_dev *pdev;
 
-    ASSERT(d || pcidevs_locked());
+    ASSERT((d && rw_is_locked(&d->pci_lock)) || pcidevs_locked());
 
     /*
      * The hardware domain owns the majority of the devices in the system.
@@ -748,7 +750,9 @@ int pci_add_device(u16 seg, u8 bus, u8 devfn,
     if ( !pdev->domain )
     {
         pdev->domain = hardware_domain;
+        write_lock(&hardware_domain->pci_lock);
         list_add(&pdev->domain_list, &hardware_domain->pdev_list);
+        write_unlock(&hardware_domain->pci_lock);
 
         /*
          * For devices not discovered by Xen during boot, add vPCI handlers
@@ -887,26 +891,62 @@ static int deassign_device(struct domain *d, uint16_t 
seg, uint8_t bus,
 
 int pci_release_devices(struct domain *d)
 {
-    struct pci_dev *pdev, *tmp;
-    u8 bus, devfn;
-    int ret;
+    int combined_ret;
+    LIST_HEAD(failed_pdevs);
 
     pcidevs_lock();
-    ret = arch_pci_clean_pirqs(d);
-    if ( ret )
+    write_lock(&d->pci_lock);
+    combined_ret = arch_pci_clean_pirqs(d);
+    if ( combined_ret )
     {
         pcidevs_unlock();
-        return ret;
+        write_unlock(&d->pci_lock);
+        return combined_ret;
     }
-    list_for_each_entry_safe ( pdev, tmp, &d->pdev_list, domain_list )
+
+    while ( !list_empty(&d->pdev_list) )
     {
-        bus = pdev->bus;
-        devfn = pdev->devfn;
-        ret = deassign_device(d, pdev->seg, bus, devfn) ?: ret;
+        struct pci_dev *pdev = list_first_entry(&d->pdev_list,
+                                                struct pci_dev,
+                                                domain_list);
+        uint16_t seg = pdev->seg;
+        uint8_t bus = pdev->bus;
+        uint8_t devfn = pdev->devfn;
+        int ret;
+
+        write_unlock(&d->pci_lock);
+        ret = deassign_device(d, seg, bus, devfn);
+        write_lock(&d->pci_lock);
+        if ( ret )
+        {
+            bool still_present = false;
+            const struct pci_dev *tmp;
+
+            /*
+             * We need to check if deassign_device() left our pdev in
+             * domain's list. As we dropped the lock, we can't be sure
+             * that list wasn't permutated in some random way, so we
+             * need to traverse the whole list.
+             */
+            for_each_pdev ( d, tmp )
+            {
+                if ( tmp == pdev )
+                {
+                    still_present = true;
+                    break;
+                }
+            }
+            if ( still_present )
+                list_move(&pdev->domain_list, &failed_pdevs);
+            combined_ret = ret;
+        }
     }
+
+    list_splice(&failed_pdevs, &d->pdev_list);
+    write_unlock(&d->pci_lock);
     pcidevs_unlock();
 
-    return ret;
+    return combined_ret;
 }
 
 #define PCI_CLASS_BRIDGE_HOST    0x0600
@@ -1125,7 +1165,9 @@ static int __hwdom_init cf_check _setup_hwdom_pci_devices(
             if ( !pdev->domain )
             {
                 pdev->domain = ctxt->d;
+                write_lock(&ctxt->d->pci_lock);
                 list_add(&pdev->domain_list, &ctxt->d->pdev_list);
+                write_unlock(&ctxt->d->pci_lock);
                 setup_one_hwdom_device(ctxt, pdev);
             }
             else if ( pdev->domain == dom_xen )
@@ -1487,6 +1529,7 @@ static int iommu_get_device_group(
         return group_id;
 
     pcidevs_lock();
+    read_lock(&d->pci_lock);
     for_each_pdev( d, pdev )
     {
         unsigned int b = pdev->bus;
@@ -1501,6 +1544,7 @@ static int iommu_get_device_group(
         sdev_id = iommu_call(ops, get_device_group_id, seg, b, df);
         if ( sdev_id < 0 )
         {
+            read_unlock(&d->pci_lock);
             pcidevs_unlock();
             return sdev_id;
         }
@@ -1511,6 +1555,7 @@ static int iommu_get_device_group(
 
             if ( unlikely(copy_to_guest_offset(buf, i, &bdf, 1)) )
             {
+                read_unlock(&d->pci_lock);
                 pcidevs_unlock();
                 return -EFAULT;
             }
@@ -1518,6 +1563,7 @@ static int iommu_get_device_group(
         }
     }
 
+    read_unlock(&d->pci_lock);
     pcidevs_unlock();
 
     return i;
diff --git a/xen/drivers/passthrough/vtd/iommu.c 
b/xen/drivers/passthrough/vtd/iommu.c
index 0e3062c820..6a36cc18fe 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -186,6 +186,8 @@ static bool any_pdev_behind_iommu(const struct domain *d,
 {
     const struct pci_dev *pdev;
 
+    ASSERT(rw_is_locked(&d->pci_lock));
+
     for_each_pdev ( d, pdev )
     {
         const struct acpi_drhd_unit *drhd;
@@ -2765,6 +2767,7 @@ static int cf_check reassign_device_ownership(
 
     if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
     {
+        read_lock(&target->pci_lock);
         if ( !has_arch_pdevs(target) )
             vmx_pi_hooks_assign(target);
 
@@ -2780,21 +2783,26 @@ static int cf_check reassign_device_ownership(
 #endif
 
         ret = domain_context_mapping(target, devfn, pdev);
+        read_unlock(&target->pci_lock);
 
         if ( !ret && pdev->devfn == devfn &&
              !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) )
         {
             const struct acpi_drhd_unit *drhd = 
acpi_find_matched_drhd_unit(pdev);
 
+            read_lock(&source->pci_lock);
             if ( drhd )
                 check_cleanup_domid_map(source, pdev, drhd->iommu);
+            read_unlock(&source->pci_lock);
         }
     }
     else
     {
         const struct acpi_drhd_unit *drhd;
 
+        read_lock(&source->pci_lock);
         drhd = domain_context_unmap(source, devfn, pdev);
+        read_unlock(&source->pci_lock);
         ret = IS_ERR(drhd) ? PTR_ERR(drhd) : 0;
     }
     if ( ret )
@@ -2806,12 +2814,21 @@ static int cf_check reassign_device_ownership(
 
     if ( devfn == pdev->devfn && pdev->domain != target )
     {
-        list_move(&pdev->domain_list, &target->pdev_list);
+        write_lock(&pdev->domain->pci_lock);
+        list_del(&pdev->domain_list);
+        write_unlock(&pdev->domain->pci_lock);
+
+        write_lock(&target->pci_lock);
+        list_add(&pdev->domain_list, &target->pdev_list);
+        write_unlock(&target->pci_lock);
+
         pdev->domain = target;
     }
 
+    read_lock(&source->pci_lock);
     if ( !has_arch_pdevs(source) )
         vmx_pi_hooks_deassign(source);
+    read_unlock(&source->pci_lock);
 
     /*
      * If the device belongs to the hardware domain, and it has RMRR, don't
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 85242a73d3..80dd150bbf 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -460,6 +460,7 @@ struct domain
 
 #ifdef CONFIG_HAS_PCI
     struct list_head pdev_list;
+    rwlock_t pci_lock;
 #endif
 
 #ifdef CONFIG_HAS_PASSTHROUGH
-- 
2.41.0

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.