[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[xen stable-4.12] AMD/IOMMU: re-assign devices directly



commit 1a250caadb837748e4a797019fc3cb0f790c9bab
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Tue Apr 5 15:38:19 2022 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Tue Apr 5 15:38:19 2022 +0200

    AMD/IOMMU: re-assign devices directly
    
    Devices with unity map ranges, due to it being unspecified how/when
    these memory ranges may get accessed, may not be left disconnected from
    their unity mappings (as long as it's not certain that the device has
    been fully quiesced). Hence rather than tearing down the old root page
    table pointer and then establishing the new one, re-assignment needs to
    be done in a single step.
    
    This is CVE-2022-26360 / part of XSA-400.
    
    Reported-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
    
    Similarly quarantining scratch-page mode relies on page tables to be
    continuously wired up.
    
    To avoid complicating things more than necessary, treat all devices
    mostly equally, i.e. regardless of their association with any unity map
    ranges.  The main difference is when it comes to updating DTEs, which need
    to be atomic when there are unity mappings. Yet atomicity can only be
    achieved with CMPXCHG16B, availability of which we can't take for given.
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Paul Durrant <paul@xxxxxxx>
    Reviewed-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
    master commit: 1fa6e9aa36233fe9c29a204fcb2697e985b8345f
    master date: 2022-04-05 14:18:04 +0200
---
 xen/drivers/passthrough/amd/iommu_map.c       | 116 ++++++++++++++++-
 xen/drivers/passthrough/amd/pci_amd_iommu.c   | 173 ++++++++++++++++++--------
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h |   8 +-
 3 files changed, 239 insertions(+), 58 deletions(-)

diff --git a/xen/drivers/passthrough/amd/iommu_map.c 
b/xen/drivers/passthrough/amd/iommu_map.c
index a5492da9fb..2986ae5fc9 100644
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -158,12 +158,105 @@ static unsigned int set_iommu_pte_present(unsigned long 
pt_mfn,
     return flush_flags;
 }
 
-void amd_iommu_set_root_page_table(uint32_t *dte, uint64_t root_ptr,
-                                   uint16_t domain_id, uint8_t paging_mode,
-                                   uint8_t valid)
+/*
+ * This function returns
+ * - -errno for errors,
+ * - 0 for a successful update, atomic when necessary
+ * - 1 for a successful but non-atomic update, which may need to be warned
+ *   about by the caller.
+ */
+int amd_iommu_set_root_page_table(uint32_t *dte, uint64_t root_ptr,
+                                  uint16_t domain_id, uint8_t paging_mode,
+                                  unsigned int flags)
 {
+    bool valid = flags & SET_ROOT_VALID;
     uint32_t addr_hi, addr_lo, entry, dte0 = dte[0];
 
+    addr_lo = root_ptr & DMA_32BIT_MASK;
+    addr_hi = root_ptr >> 32;
+
+    if ( get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_VALID_MASK,
+                                IOMMU_DEV_TABLE_VALID_SHIFT) &&
+         get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
+                                IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT) &&
+         (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) )
+    {
+        union {
+            uint32_t dte[4];
+            uint64_t raw64[2];
+            __uint128_t raw128;
+        } ldte;
+        __uint128_t old;
+        int ret = 0;
+
+        memcpy(ldte.dte, dte, sizeof(ldte));
+        old = ldte.raw128;
+
+        set_field_in_reg_u32(domain_id, ldte.dte[2],
+                             IOMMU_DEV_TABLE_DOMAIN_ID_MASK,
+                             IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &ldte.dte[2]);
+
+        set_field_in_reg_u32(addr_hi, ldte.dte[1],
+                             IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
+                             IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT,
+                             &ldte.dte[1]);
+        set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, ldte.dte[1],
+                             IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK,
+                             IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT,
+                             &ldte.dte[1]);
+        set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, ldte.dte[1],
+                             IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK,
+                             IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT,
+                             &ldte.dte[1]);
+
+        set_field_in_reg_u32(addr_lo >> PAGE_SHIFT, ldte.dte[0],
+                             IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+                             IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT,
+                             &ldte.dte[0]);
+        set_field_in_reg_u32(paging_mode, ldte.dte[0],
+                             IOMMU_DEV_TABLE_PAGING_MODE_MASK,
+                             IOMMU_DEV_TABLE_PAGING_MODE_SHIFT, &ldte.dte[0]);
+        set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, ldte.dte[0],
+                             IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK,
+                             IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT,
+                             &ldte.dte[0]);
+        set_field_in_reg_u32(valid ? IOMMU_CONTROL_ENABLED
+                                   : IOMMU_CONTROL_DISABLED,
+                             ldte.dte[0], IOMMU_DEV_TABLE_VALID_MASK,
+                             IOMMU_DEV_TABLE_VALID_SHIFT, &ldte.dte[0]);
+
+        if ( cpu_has_cx16 )
+        {
+            __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128);
+
+            /*
+             * Hardware does not update the DTE behind our backs, so the
+             * return value should match "old".
+             */
+            if ( res != old )
+            {
+                printk(XENLOG_ERR
+                       "Dom%d: unexpected DTE %016lx_%016lx (expected 
%016lx_%016lx)\n",
+                       domain_id,
+                       (uint64_t)(res >> 64), (uint64_t)res,
+                       (uint64_t)(old >> 64), (uint64_t)old);
+                ret = -EILSEQ;
+            }
+        }
+        else /* Best effort, updating domain_id last. */
+        {
+            uint64_t *ptr = (void *)dte;
+
+            write_atomic(ptr + 0, ldte.raw64[0]);
+            /* No barrier should be needed between these two. */
+            write_atomic(ptr + 1, ldte.raw64[1]);
+
+            ret = 1;
+        }
+
+        return ret;
+    }
+
     if ( valid ||
          get_field_from_reg_u32(dte0, IOMMU_DEV_TABLE_VALID_MASK,
                                 IOMMU_DEV_TABLE_VALID_SHIFT) )
@@ -183,9 +276,6 @@ void amd_iommu_set_root_page_table(uint32_t *dte, uint64_t 
root_ptr,
                          IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry);
     dte[2] = entry;
 
-    addr_lo = root_ptr & DMA_32BIT_MASK;
-    addr_hi = root_ptr >> 32;
-
     set_field_in_reg_u32(addr_hi, 0,
                          IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
                          IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT, &entry);
@@ -212,6 +302,20 @@ void amd_iommu_set_root_page_table(uint32_t *dte, uint64_t 
root_ptr,
                          IOMMU_DEV_TABLE_VALID_MASK,
                          IOMMU_DEV_TABLE_VALID_SHIFT, &entry);
     write_atomic(&dte[0], entry);
+
+    return 0;
+}
+
+paddr_t amd_iommu_get_root_page_table(const uint32_t *dte)
+{
+    uint32_t lo = get_field_from_reg_u32(
+                      dte[0], IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK,
+                      IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT);
+    uint32_t hi = get_field_from_reg_u32(
+                      dte[1], IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK,
+                      IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT);
+
+    return ((paddr_t)hi << 32) | (lo << PAGE_SHIFT);
 }
 
 void iommu_dte_set_iotlb(uint32_t *dte, uint8_t i)
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c 
b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index e9153a5271..3cbb2e9b61 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -109,22 +109,60 @@ static void disable_translation(u32 *dte)
     dte[0] = entry;
 }
 
-static void amd_iommu_setup_domain_device(
+static int __must_check allocate_domain_resources(struct domain_iommu *hd)
+{
+    int rc;
+
+    spin_lock(&hd->arch.mapping_lock);
+    rc = amd_iommu_alloc_root(hd);
+    spin_unlock(&hd->arch.mapping_lock);
+
+    return rc;
+}
+
+static bool any_pdev_behind_iommu(const struct domain *d,
+                                  const struct pci_dev *exclude,
+                                  const struct amd_iommu *iommu)
+{
+    const struct pci_dev *pdev;
+
+    for_each_pdev ( d, pdev )
+    {
+        if ( pdev == exclude )
+            continue;
+
+        if ( find_iommu_for_device(pdev->seg,
+                                   PCI_BDF2(pdev->bus, pdev->devfn)) == iommu )
+            return true;
+    }
+
+    return false;
+}
+
+static int __must_check amd_iommu_setup_domain_device(
     struct domain *domain, struct amd_iommu *iommu,
     u8 devfn, struct pci_dev *pdev)
 {
-    void *dte;
+    uint32_t *dte;
     unsigned long flags;
-    int req_id, valid = 1;
-    int dte_i = 0;
+    unsigned int req_id, sr_flags;
+    int dte_i = 0, rc;
     u8 bus = pdev->bus;
-    const struct domain_iommu *hd = dom_iommu(domain);
+    struct domain_iommu *hd = dom_iommu(domain);
+    const struct ivrs_mappings *ivrs_dev;
+
+    BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
 
-    BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
-            !iommu->dev_table.buffer );
+    rc = allocate_domain_resources(hd);
+    if ( rc )
+        return rc;
 
-    if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
-        valid = 0;
+    req_id = get_dma_requestor_id(iommu->seg,
+                                  PCI_BDF2(pdev->bus, pdev->devfn));
+    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+    sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain)
+                ? 0 : SET_ROOT_VALID)
+               | (ivrs_dev->unity_map ? SET_ROOT_WITH_UNITY_MAP : 0);
 
     if ( ats_enabled )
         dte_i = 1;
@@ -132,32 +170,87 @@ static void amd_iommu_setup_domain_device(
     /* get device-table entry */
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
 
     spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !is_translation_valid((u32 *)dte) )
     {
         /* bind DTE to domain page-tables */
-        amd_iommu_set_root_page_table(
-            (u32 *)dte, page_to_maddr(hd->arch.root_table), domain->domain_id,
-            hd->arch.paging_mode, valid);
+        rc = amd_iommu_set_root_page_table(
+                 dte, page_to_maddr(hd->arch.root_table),
+                 domain->domain_id, hd->arch.paging_mode, sr_flags);
+        if ( rc )
+        {
+            ASSERT(rc < 0);
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return rc;
+        }
 
         if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
              iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
             iommu_dte_set_iotlb((u32 *)dte, dte_i);
 
         amd_iommu_flush_device(iommu, req_id);
+    }
+    else if ( amd_iommu_get_root_page_table(dte) !=
+              page_to_maddr(hd->arch.root_table) )
+    {
+        /*
+         * Strictly speaking if the device is the only one with this requestor
+         * ID, it could be allowed to be re-assigned regardless of unity map
+         * presence.  But let's deal with that case only if it is actually
+         * found in the wild.
+         */
+        if ( req_id != PCI_BDF2(bus, devfn) &&
+             (sr_flags & SET_ROOT_WITH_UNITY_MAP) )
+            rc = -EOPNOTSUPP;
+        else
+            rc = amd_iommu_set_root_page_table(
+                     dte, page_to_maddr(hd->arch.root_table),
+                     domain->domain_id, hd->arch.paging_mode, sr_flags);
+        if ( rc < 0 )
+        {
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return rc;
+        }
+        if ( rc &&
+             domain != pdev->domain &&
+             /*
+              * By non-atomically updating the DTE's domain ID field last,
+              * during a short window in time TLB entries with the old domain
+              * ID but the new page tables may have been inserted.  This could
+              * affect I/O of other devices using this same (old) domain ID.
+              * Such updating therefore is not a problem if this was the only
+              * device associated with the old domain ID.  Diverting I/O of any
+              * of a dying domain's devices to the quarantine page tables is
+              * intended anyway.
+              */
+             !pdev->domain->is_dying &&
+             (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
+              pdev->phantom_stride) )
+            printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data 
corruption\n",
+                   pdev->seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+                   pdev->domain);
 
-        AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
-                        "root table = %#"PRIx64", "
-                        "domain = %d, paging mode = %d\n",
-                        req_id, pdev->type,
-                        page_to_maddr(hd->arch.root_table),
-                        domain->domain_id, hd->arch.paging_mode);
+        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
+             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+            ASSERT(get_field_from_reg_u32(
+                       dte[3], IOMMU_DEV_TABLE_IOTLB_SUPPORT_MASK,
+                       IOMMU_DEV_TABLE_IOTLB_SUPPORT_SHIFT) == dte_i);
+
+        amd_iommu_flush_device(iommu, req_id);
     }
 
     spin_unlock_irqrestore(&iommu->lock, flags);
 
+    AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+                    "root table = %#"PRIx64", "
+                    "domain = %d, paging mode = %d\n",
+                    req_id, pdev->type,
+                    page_to_maddr(hd->arch.root_table),
+                    domain->domain_id, hd->arch.paging_mode);
+
     ASSERT(pcidevs_locked());
 
     if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
@@ -168,6 +261,8 @@ static void amd_iommu_setup_domain_device(
 
         amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0);
     }
+
+    return 0;
 }
 
 int __init amd_iov_detect(void)
@@ -211,17 +306,6 @@ int amd_iommu_alloc_root(struct domain_iommu *hd)
     return 0;
 }
 
-static int __must_check allocate_domain_resources(struct domain_iommu *hd)
-{
-    int rc;
-
-    spin_lock(&hd->arch.mapping_lock);
-    rc = amd_iommu_alloc_root(hd);
-    spin_unlock(&hd->arch.mapping_lock);
-
-    return rc;
-}
-
 int __read_mostly amd_iommu_min_paging_mode = 1;
 
 static int amd_iommu_domain_init(struct domain *d)
@@ -310,7 +394,6 @@ static int reassign_device(struct domain *source, struct 
domain *target,
 {
     struct amd_iommu *iommu;
     int bdf, rc;
-    struct domain_iommu *t = dom_iommu(target);
     const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
 
     bdf = PCI_BDF2(pdev->bus, pdev->devfn);
@@ -324,7 +407,15 @@ static int reassign_device(struct domain *source, struct 
domain *target,
         return -ENODEV;
     }
 
-    amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
+    rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
+    if ( rc )
+        return rc;
+
+    if ( devfn == pdev->devfn && pdev->domain != target )
+    {
+        list_move(&pdev->domain_list, &target->arch.pdev_list);
+        pdev->domain = target;
+    }
 
     /*
      * If the device belongs to the hardware domain, and it has a unity 
mapping,
@@ -340,27 +431,10 @@ static int reassign_device(struct domain *source, struct 
domain *target,
             return rc;
     }
 
-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
-    {
-        list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
-        pdev->domain = dom_io;
-    }
-
-    rc = allocate_domain_resources(t);
-    if ( rc )
-        return rc;
-
-    amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
     AMD_IOMMU_DEBUG("Re-assign %04x:%02x:%02x.%u from dom%d to dom%d\n",
                     pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
                     source->domain_id, target->domain_id);
 
-    if ( devfn == pdev->devfn && pdev->domain != target )
-    {
-        list_move(&pdev->domain_list, &target->arch.pdev_list);
-        pdev->domain = target;
-    }
-
     return 0;
 }
 
@@ -491,8 +565,7 @@ static int amd_iommu_add_device(u8 devfn, struct pci_dev 
*pdev)
         return -ENODEV;
     }
 
-    amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
-    return 0;
+    return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
 }
 
 static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h 
b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
index 6c83810fc3..a3211ccdfb 100644
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -78,8 +78,12 @@ void amd_iommu_share_p2m(struct domain *d);
 int get_dma_requestor_id(u16 seg, u16 bdf);
 void amd_iommu_set_intremap_table(
     u32 *dte, u64 intremap_ptr, u8 int_valid);
-void amd_iommu_set_root_page_table(
-    u32 *dte, u64 root_ptr, u16 domain_id, u8 paging_mode, u8 valid);
+#define SET_ROOT_VALID          (1u << 0)
+#define SET_ROOT_WITH_UNITY_MAP (1u << 1)
+int __must_check amd_iommu_set_root_page_table(
+    uint32_t *dte, uint64_t root_ptr, uint16_t domain_id, uint8_t paging_mode,
+    unsigned int flags);
+paddr_t amd_iommu_get_root_page_table(const uint32_t *dte);
 void iommu_dte_set_iotlb(u32 *dte, u8 i);
 void iommu_dte_add_device_entry(u32 *dte, struct ivrs_mappings *ivrs_dev);
 void iommu_dte_set_guest_cr3(u32 *dte, u16 dom_id, u64 gcr3,
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.12



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.