[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[xen stable-4.12] IOMMU/x86: use per-device page tables for quarantining



commit c633ec9451e76015c409bd5119ffcb0f2e61fe8b
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Tue Apr 5 15:42:45 2022 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Tue Apr 5 15:42:45 2022 +0200

    IOMMU/x86: use per-device page tables for quarantining
    
    Devices with RMRRs / unity mapped regions, due to it being unspecified
    how/when these memory regions may be accessed, may not be left
    disconnected from the mappings of these regions (as long as it's not
    certain that the device has been fully quiesced). Hence even the page
    tables used when quarantining such devices need to have mappings of
    those regions. This implies installing page tables in the first place
    even when not in scratch-page quarantining mode.
    
    This is CVE-2022-26361 / part of XSA-400.
    
    While for the purpose here it would be sufficient to have devices with
    RMRRs / unity mapped regions use per-device page tables, extend this to
    all devices (in scratch-page quarantining mode). This allows the leaf
    pages to be mapped r/w, thus covering also memory writes (rather than
    just reads) issued by non-quiescent devices.
    
    Set up quarantine page tables as late as possible, yet early enough to
    not encounter failure during de-assign. This means setup generally
    happens in assign_device(), while (for now) the one in deassign_device()
    is there mainly to be on the safe side.
    
    In VT-d's DID allocation function don't require the IOMMU lock to be
    held anymore: All involved code paths hold pcidevs_lock, so this way we
    avoid the need to acquire the IOMMU lock around the new call to
    context_set_domain_id().
    
    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Paul Durrant <paul@xxxxxxx>
    Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx>
    Reviewed-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
    master commit: 14dd241aad8af447680ac73e8579990e2c09c1e7
    master date: 2022-04-05 14:24:18 +0200
---
 xen/arch/x86/mm/p2m.c                         |   2 +-
 xen/drivers/passthrough/amd/iommu_map.c       | 168 +++++++++++++-----
 xen/drivers/passthrough/amd/pci_amd_iommu.c   |  39 ++--
 xen/drivers/passthrough/iommu.c               |  16 +-
 xen/drivers/passthrough/pci.c                 |  20 ++-
 xen/drivers/passthrough/vtd/iommu.c           | 247 +++++++++++++++++++-------
 xen/drivers/passthrough/vtd/iommu.h           |   2 +-
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h |   3 +-
 xen/include/asm-x86/pci.h                     |  13 ++
 xen/include/xen/iommu.h                       |   3 +-
 10 files changed, 378 insertions(+), 135 deletions(-)

diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 266b6a0416..c7bf160be9 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1424,7 +1424,7 @@ int set_identity_p2m_entry(struct domain *d, unsigned long gfn_l,
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     int ret;
 
-    if ( !paging_mode_translate(p2m->domain) )
+    if ( !paging_mode_translate(d) )
     {
         if ( !has_iommu_pt(d) )
             return 0;
diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
index 62bcfc5af7..9bdf9e25e4 100644
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -784,64 +784,150 @@ void amd_iommu_share_p2m(struct domain *d)
     }
 }
 
-int __init amd_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(uint64_t *this, unsigned int level,
+                    struct page_info *pgs[IOMMU_MAX_PT_LEVELS],
+                    struct pci_dev *pdev)
 {
-    struct domain_iommu *hd = dom_iommu(d);
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i )
+    {
+        uint32_t *pte = (uint32_t *)&this[i];
+        uint64_t *next;
+
+        if ( !get_field_from_reg_u32(pte[0], IOMMU_PTE_PRESENT_MASK,
+                                     IOMMU_PTE_PRESENT_SHIFT) )
+        {
+            if ( !pgs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                pgs[level] = alloc_amd_iommu_pgtable();
+                if ( !pgs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                page_list_add(pgs[level], &pdev->arch.pgtables_list);
+
+                if ( level )
+                {
+                    next = __map_domain_page(pgs[level]);
+                    rc = fill_qpt(next, level - 1, pgs, pdev);
+                    unmap_domain_page(next);
+                }
+            }
+
+            /*
+             * PDEs are essentially a subset of PTEs, so this function
+             * is fine to use even at the leaf.
+             */
+            set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level,
+                                  true, true);
+        }
+        else if ( level &&
+                  get_field_from_reg_u32(pte[0],
+                                         IOMMU_PDE_NEXT_LEVEL_MASK,
+                                         IOMMU_PDE_NEXT_LEVEL_SHIFT) )
+        {
+            paddr_t addr_hi = get_field_from_reg_u32(pte[1],
+                                                     IOMMU_PTE_ADDR_HIGH_MASK,
+                                                     IOMMU_PTE_ADDR_HIGH_SHIFT);
+            paddr_t addr_lo = get_field_from_reg_u32(pte[0],
+                                                     IOMMU_PTE_ADDR_LOW_MASK,
+                                                     IOMMU_PTE_ADDR_LOW_SHIFT);
+            unsigned long mfn = (addr_hi << (32 - PAGE_SHIFT)) | addr_lo;
+
+            page_list_add(mfn_to_page(_mfn(mfn)), &pdev->arch.pgtables_list);
+            next = map_domain_page(_mfn(mfn));
+            rc = fill_qpt(next, level - 1, pgs, pdev);
+            unmap_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+int amd_iommu_quarantine_init(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
     unsigned long end_gfn =
         1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
     unsigned int level = amd_iommu_get_paging_mode(end_gfn);
-    uint64_t *table;
+    unsigned int req_id = get_dma_requestor_id(pdev->seg,
+                                               PCI_BDF2(pdev->bus, pdev->devfn));
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    int rc;
 
-    if ( hd->arch.root_table )
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.root_table);
+
+    ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
+
+    if ( pdev->arch.amd.root_table )
     {
-        ASSERT_UNREACHABLE();
+        clear_domain_page(pdev->arch.leaf_mfn);
         return 0;
     }
 
-    spin_lock(&hd->arch.mapping_lock);
+    pdev->arch.amd.root_table = alloc_amd_iommu_pgtable();
+    if ( !pdev->arch.amd.root_table )
+        return -ENOMEM;
 
-    hd->arch.root_table = alloc_amd_iommu_pgtable();
-    if ( !hd->arch.root_table )
-        goto out;
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
+    hd->arch.root_table = pdev->arch.amd.root_table;
 
-    table = __map_domain_page(hd->arch.root_table);
-    while ( level )
+    rc = amd_iommu_reserve_domain_unity_map(dom_io,
+                                            ivrs_mappings[req_id].unity_map,
+                                            0);
+
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.root_table = NULL;
+
+    if ( rc )
+        printk("%04x:%02x:%02x.%u: quarantine unity mapping failed\n",
+               pdev->seg, pdev->bus,
+               PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+    else
     {
-        struct page_info *pg;
-        unsigned int i;
-
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        pg = alloc_amd_iommu_pgtable();
-        if ( !pg )
-            break;
-
-        for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
-        {
-            uint32_t *pde = (uint32_t *)&table[i];
+        uint64_t *root;
+        struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
 
-            /*
-             * PDEs are essentially a subset of PTEs, so this function
-             * is fine to use even at the leaf.
-             */
-            set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
-                                  false, true);
-        }
+        spin_lock(&hd->arch.mapping_lock);
 
-        unmap_domain_page(table);
-        table = __map_domain_page(pg);
-        level--;
+        root = __map_domain_page(pdev->arch.amd.root_table);
+        rc = fill_qpt(root, level - 1, pgs, pdev);
+        unmap_domain_page(root);
+
+        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
+
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_domain_page(table);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    if ( rc )
+        amd_iommu_quarantine_teardown(pdev);
+
+    return rc;
+}
+
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
+{
+    struct page_info *pg;
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.amd.root_table )
+        return;
+
+    while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
+        free_amd_iommu_pgtable(pg);
 
-    /* Pages leaked in failure case */
-    return level ? -ENOMEM : 0;
+    pdev->arch.amd.root_table = NULL;
 }
 
 /*
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 301a8f1229..567472e48d 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -150,6 +150,8 @@ static int __must_check amd_iommu_setup_domain_device(
     u8 bus = pdev->bus;
     struct domain_iommu *hd = dom_iommu(domain);
     const struct ivrs_mappings *ivrs_dev;
+    const struct page_info *root_pg;
+    domid_t domid;
 
     BUG_ON(!hd->arch.paging_mode || !iommu->dev_table.buffer);
 
@@ -172,14 +174,25 @@ static int __must_check amd_iommu_setup_domain_device(
     dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
     ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
 
+    if ( domain != dom_io )
+    {
+        root_pg = hd->arch.root_table;
+        domid = domain->domain_id;
+    }
+    else
+    {
+        root_pg = pdev->arch.amd.root_table;
+        domid = pdev->arch.pseudo_domid;
+    }
+
     spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !is_translation_valid((u32 *)dte) )
     {
         /* bind DTE to domain page-tables */
         rc = amd_iommu_set_root_page_table(
-                 dte, page_to_maddr(hd->arch.root_table),
-                 domain->domain_id, hd->arch.paging_mode, sr_flags);
+                 dte, page_to_maddr(root_pg), domid,
+                 hd->arch.paging_mode, sr_flags);
         if ( rc )
         {
             ASSERT(rc < 0);
@@ -193,8 +206,7 @@ static int __must_check amd_iommu_setup_domain_device(
 
         amd_iommu_flush_device(iommu, req_id);
     }
-    else if ( amd_iommu_get_root_page_table(dte) !=
-              page_to_maddr(hd->arch.root_table) )
+    else if ( amd_iommu_get_root_page_table(dte) != page_to_maddr(root_pg) )
     {
         /*
          * Strictly speaking if the device is the only one with this requestor
@@ -207,8 +219,8 @@ static int __must_check amd_iommu_setup_domain_device(
             rc = -EOPNOTSUPP;
         else
             rc = amd_iommu_set_root_page_table(
-                     dte, page_to_maddr(hd->arch.root_table),
-                     domain->domain_id, hd->arch.paging_mode, sr_flags);
+                     dte, page_to_maddr(root_pg), domid,
+                     hd->arch.paging_mode, sr_flags);
         if ( rc < 0 )
         {
             spin_unlock_irqrestore(&iommu->lock, flags);
@@ -227,6 +239,7 @@ static int __must_check amd_iommu_setup_domain_device(
               * intended anyway.
               */
              !pdev->domain->is_dying &&
+             pdev->domain != dom_io &&
              (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
               pdev->phantom_stride) )
             printk(" %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n",
@@ -247,9 +260,8 @@ static int __must_check amd_iommu_setup_domain_device(
     AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
                     "root table = %#"PRIx64", "
                     "domain = %d, paging mode = %d\n",
-                    req_id, pdev->type,
-                    page_to_maddr(hd->arch.root_table),
-                    domain->domain_id, hd->arch.paging_mode);
+                    req_id, pdev->type, page_to_maddr(root_pg),
+                    domid, hd->arch.paging_mode);
 
     ASSERT(pcidevs_locked());
 
@@ -296,7 +308,7 @@ int __init amd_iov_detect(void)
 
 int amd_iommu_alloc_root(struct domain_iommu *hd)
 {
-    if ( unlikely(!hd->arch.root_table) )
+    if ( unlikely(!hd->arch.root_table) && hd != dom_iommu(dom_io) )
     {
         hd->arch.root_table = alloc_amd_iommu_pgtable();
         if ( !hd->arch.root_table )
@@ -376,7 +388,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
 
         AMD_IOMMU_DEBUG("Disable: device id = %#x, "
                         "domain = %d, paging mode = %d\n",
-                        req_id,  domain->domain_id,
+                        req_id,
+                        get_field_from_reg_u32(((uint32_t *)dte)[2],
+                                               IOMMU_DEV_TABLE_DOMAIN_ID_MASK,
+                                               IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT),
                         dom_iommu(domain)->arch.paging_mode);
     }
     spin_unlock_irqrestore(&iommu->lock, flags);
@@ -605,6 +620,8 @@ static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
 
     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
 
+    amd_iommu_quarantine_teardown(pdev);
+
     iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
     pdev->arch.pseudo_domid = DOMID_INVALID;
 
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index a747bfb946..f8af06c0fd 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -497,19 +497,19 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags)
     return rc;
 }
 
-static int __init iommu_quarantine_init(void)
+int iommu_quarantine_dev_init(device_t *dev)
 {
     const struct domain_iommu *hd = dom_iommu(dom_io);
-    int rc;
 
-    rc = iommu_domain_init(dom_io);
-    if ( rc )
-        return rc;
-
-    if ( !hd->platform_ops->quarantine_init )
+    if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
         return 0;
 
-    return hd->platform_ops->quarantine_init(dom_io);
+    return hd->platform_ops->quarantine_init(dev);
+}
+
+static int __init iommu_quarantine_init(void)
+{
+    return iommu_domain_init(dom_io);
 }
 
 int __init iommu_setup(void)
diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index f4d9777b52..768b900830 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -1512,6 +1512,13 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
         msixtbl_init(d);
     }
 
+    if ( pdev->domain != dom_io )
+    {
+        rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( rc )
+            goto done;
+    }
+
     pdev->fault.count = 0;
 
     if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
@@ -1558,9 +1565,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
         return -ENODEV;
 
     /* De-assignment from dom_io should de-quarantine the device */
-    target = ((pdev->quarantine || iommu_quarantine) &&
-              pdev->domain != dom_io) ?
-        dom_io : hardware_domain;
+    if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
+    {
+        ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( ret )
+           return ret;
+
+        target = dom_io;
+    }
+    else
+        target = hardware_domain;
 
     while ( pdev->phantom_stride )
     {
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index b0918643b3..5ca1d02981 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -43,6 +43,12 @@
 #include "vtd.h"
 #include "../ats.h"
 
+#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
+                                             : (pdev)->arch.pseudo_domid)
+#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
+                                 ? dom_iommu(d)->arch.pgd_maddr \
+                                 : (pdev)->arch.vtd.pgd_maddr)
+
 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
 bool __read_mostly untrusted_msi;
 
@@ -78,13 +84,18 @@ static int get_iommu_did(domid_t domid, const struct iommu *iommu,
 
 #define DID_FIELD_WIDTH 16
 #define DID_HIGH_OFFSET 8
+
+/*
+ * This function may have "context" passed as NULL, to merely obtain a DID
+ * for "domid".
+ */
 static int context_set_domain_id(struct context_entry *context,
                                  domid_t domid, struct iommu *iommu)
 {
     unsigned long nr_dom, i;
     int found = 0;
 
-    ASSERT(spin_is_locked(&iommu->lock));
+    ASSERT(pcidevs_locked());
 
     nr_dom = cap_ndoms(iommu->cap);
     i = find_first_bit(iommu->domid_bitmap, nr_dom);
@@ -110,8 +121,13 @@ static int context_set_domain_id(struct context_entry *context,
     }
 
     set_bit(i, iommu->domid_bitmap);
-    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-    context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+
+    if ( context )
+    {
+        context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+        context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+    }
+
     return 0;
 }
 
@@ -179,8 +195,12 @@ static void check_cleanup_domid_map(struct domain *d,
                                     const struct pci_dev *exclude,
                                     struct iommu *iommu)
 {
-    bool found = any_pdev_behind_iommu(d, exclude, iommu);
+    bool found;
 
+    if ( d == dom_io )
+        return;
+
+    found = any_pdev_behind_iommu(d, exclude, iommu);
     /*
      * Hidden devices are associated with DomXEN but usable by the hardware
      * domain. Hence they need considering here as well.
@@ -1443,7 +1463,7 @@ int domain_context_mapping_one(
         domid = iommu->domid_map[prev_did];
         if ( domid < DOMID_FIRST_RESERVED )
             prev_dom = rcu_lock_domain_by_id(domid);
-        else if ( domid == DOMID_IO )
+        else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
             prev_dom = rcu_lock_domain(dom_io);
         if ( !prev_dom )
         {
@@ -1620,15 +1640,12 @@ int domain_context_mapping_one(
     {
         if ( !prev_dom )
             domain_context_unmap_one(domain, iommu, bus, devfn,
-                                     domain->domain_id);
+                                     DEVICE_DOMID(domain, pdev));
         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
-        {
-            hd = dom_iommu(prev_dom);
             domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
-                                       domain->domain_id,
-                                       hd->arch.pgd_maddr,
+                                       DEVICE_DOMID(prev_dom, pdev),
+                                       DEVICE_PGTABLE(prev_dom, pdev),
                                        mode & MAP_WITH_RMRR);
-        }
     }
 
     if ( prev_dom )
@@ -1645,7 +1662,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
 {
     struct acpi_drhd_unit *drhd;
     const struct acpi_rmrr_unit *rmrr;
-    paddr_t pgd_maddr = dom_iommu(domain)->arch.pgd_maddr;
+    paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
     domid_t orig_domid = pdev->arch.pseudo_domid;
     int ret = 0;
     unsigned int i, mode = 0;
@@ -1668,7 +1685,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
         break;
     }
 
-    if ( domain != pdev->domain )
+    if ( domain != pdev->domain && pdev->domain != dom_io )
     {
         if ( pdev->domain->is_dying )
             mode |= MAP_OWNER_DYING;
@@ -1709,8 +1726,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
                    domain->domain_id, seg, bus,
                    PCI_SLOT(devfn), PCI_FUNC(devfn));
-        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
+                                         DEVICE_DOMID(domain, pdev), pgd_maddr,
                                          mode);
         if ( ret > 0 )
             ret = 0;
@@ -1734,8 +1751,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
                    PCI_SLOT(devfn), PCI_FUNC(devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
-                                         mode);
+                                         pdev, DEVICE_DOMID(domain, pdev),
+                                         pgd_maddr, mode);
         if ( ret < 0 )
             break;
         prev_present = ret;
@@ -1761,8 +1778,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1777,8 +1794,8 @@ static int domain_context_mapping(struct domain *domain, u8 devfn,
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         if ( ret )
         {
@@ -1914,7 +1931,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
                    domain->domain_id, seg, bus,
                    PCI_SLOT(devfn), PCI_FUNC(devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             disable_ats_device(pdev);
 
@@ -1925,7 +1942,7 @@ static const struct acpi_drhd_unit *domain_context_unmap(
             printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
                    domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( ret )
             break;
 
@@ -1934,18 +1951,12 @@ static const struct acpi_drhd_unit *domain_context_unmap(
         if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 )
             break;
 
+        ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+                                       DEVICE_DOMID(domain, pdev));
         /* PCIe to PCI/PCIx bridge */
-        if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
-        {
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
-            if ( !ret )
-                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
-                                               domain->domain_id);
-        }
-        else /* Legacy PCI bridge */
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
+        if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+            ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+                                           DEVICE_DOMID(domain, pdev));
 
         break;
 
@@ -1992,6 +2003,25 @@ static void iommu_domain_teardown(struct domain *d)
     spin_unlock(&hd->arch.mapping_lock);
 }
 
+static void quarantine_teardown(struct pci_dev *pdev,
+                                const struct acpi_drhd_unit *drhd)
+{
+    struct page_info *pg;
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.vtd.pgd_maddr )
+        return;
+
+    while ( (pg = page_list_remove_head(&pdev->arch.pgtables_list)) )
+        free_domheap_page(pg);
+
+    pdev->arch.vtd.pgd_maddr = 0;
+
+    if ( drhd )
+        cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
+}
+
 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
                                              mfn_t mfn, unsigned int flags,
                                              unsigned int *flush_flags)
@@ -2214,6 +2244,8 @@ static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
                                rmrr->end_address, 0);
     }
 
+    quarantine_teardown(pdev, drhd);
+
     if ( drhd )
     {
         iommu_free_domid(pdev->arch.pseudo_domid,
@@ -2852,60 +2884,139 @@ static void vtd_dump_p2m_table(struct domain *d)
     vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
 }
 
-static int __init intel_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(struct dma_pte *this, unsigned int level,
+                    paddr_t maddrs[6], struct pci_dev *pdev)
 {
-    struct domain_iommu *hd = dom_iommu(d);
-    struct dma_pte *parent;
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_NUM; ++i )
+    {
+        struct dma_pte *pte = &this[i], *next;
+
+        if ( !dma_pte_present(*pte) )
+        {
+            if ( !maddrs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                maddrs[level] = alloc_pgtable_maddr(NULL, 1);
+                if ( !maddrs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                page_list_add(maddr_to_page(maddrs[level]),
+                              &pdev->arch.pgtables_list);
+
+                if ( level )
+                {
+                    next = map_vtd_domain_page(maddrs[level]);
+                    rc = fill_qpt(next, level - 1, maddrs, pdev);
+                    unmap_vtd_domain_page(next);
+                }
+            }
+
+            dma_set_pte_addr(*pte, maddrs[level]);
+            dma_set_pte_readable(*pte);
+            dma_set_pte_writable(*pte);
+        }
+        else if ( level && !dma_pte_superpage(*pte) )
+        {
+            page_list_add(maddr_to_page(dma_pte_addr(*pte)),
+                          &pdev->arch.pgtables_list);
+            next = map_vtd_domain_page(dma_pte_addr(*pte));
+            rc = fill_qpt(next, level - 1, maddrs, pdev);
+            unmap_vtd_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+static int intel_iommu_quarantine_init(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
+    paddr_t maddr;
     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
     unsigned int level = agaw_to_level(agaw);
+    const struct acpi_drhd_unit *drhd;
+    const struct acpi_rmrr_unit *rmrr;
+    unsigned int i, bdf;
+    bool rmrr_found = false;
+    int rc;
 
-    if ( hd->arch.pgd_maddr )
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.pgd_maddr);
+
+    if ( pdev->arch.vtd.pgd_maddr )
     {
-        ASSERT_UNREACHABLE();
+        clear_domain_page(pdev->arch.leaf_mfn);
         return 0;
     }
 
-    spin_lock(&hd->arch.mapping_lock);
+    drhd = acpi_find_matched_drhd_unit(pdev);
+    if ( !drhd )
+        return -ENODEV;
 
-    hd->arch.pgd_maddr = alloc_pgtable_maddr(NULL, 1);
-    if ( !hd->arch.pgd_maddr )
-        goto out;
+    maddr = alloc_pgtable_maddr(NULL, 1);
+    if ( !maddr )
+        return -ENOMEM;
 
-    parent = map_vtd_domain_page(hd->arch.pgd_maddr);
-    while ( level )
-    {
-        uint64_t maddr;
-        unsigned int offset;
+    rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
 
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        maddr = alloc_pgtable_maddr(NULL, 1);
-        if ( !maddr )
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
+    hd->arch.pgd_maddr = maddr;
+
+    for_each_rmrr_device ( rmrr, bdf, i )
+    {
+        if ( rc )
             break;
 
-        for ( offset = 0; offset < PTE_NUM; offset++ )
+        if ( rmrr->segment == pdev->seg &&
+             bdf == PCI_BDF2(pdev->bus, pdev->devfn) )
         {
-            struct dma_pte *pte = &parent[offset];
+            rmrr_found = true;
 
-            dma_set_pte_addr(*pte, maddr);
-            dma_set_pte_readable(*pte);
+            rc = iommu_identity_mapping(dom_io, p2m_access_rw,
+                                        rmrr->base_address, rmrr->end_address,
+                                        0);
+            if ( rc )
+                printk(XENLOG_ERR VTDPREFIX
+                       "%04x:%02x:%02x.%u: RMRR quarantine mapping failed\n",
+                       pdev->seg, pdev->bus,
+                       PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
         }
-        iommu_sync_cache(parent, PAGE_SIZE);
+    }
 
-        unmap_vtd_domain_page(parent);
-        parent = map_vtd_domain_page(maddr);
-        level--;
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.pgd_maddr = 0;
+    pdev->arch.vtd.pgd_maddr = maddr;
+
+    if ( !rc )
+    {
+        struct dma_pte *root;
+        paddr_t maddrs[6] = {};
+
+        spin_lock(&hd->arch.mapping_lock);
+
+        root = map_vtd_domain_page(maddr);
+        rc = fill_qpt(root, level - 1, maddrs, pdev);
+        unmap_vtd_domain_page(root);
+
+        pdev->arch.leaf_mfn = maddr_to_mfn(maddrs[0]);
+
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_vtd_domain_page(parent);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    if ( rc )
+        quarantine_teardown(pdev, drhd);
 
-    /* Pages leaked in failure case */
-    return level ? -ENOMEM : 0;
+    return rc;
 }
 
 const struct iommu_ops __initconstrel intel_iommu_ops = {
diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h
index e8346e29b6..4fecb9802c 100644
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -535,7 +535,7 @@ struct iommu {
     u32 nr_pt_levels;
     u64        cap;
     u64        ecap;
-    spinlock_t lock; /* protect context, domain ids */
+    spinlock_t lock; /* protect context */
     spinlock_t register_lock; /* protect iommu register handling */
     u64 root_maddr; /* root entry machine address */
     struct msi_desc msi;
diff --git a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
index 35b7d9eb23..453d8047e1 100644
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -51,7 +51,8 @@ void get_iommu_features(struct amd_iommu *iommu);
 int amd_iommu_init(void);
 int amd_iommu_update_ivrs_mapping_acpi(void);
 
-int amd_iommu_quarantine_init(struct domain *d);
+int amd_iommu_quarantine_init(struct pci_dev *pdev);
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev);
 
 /* mapping functions */
 int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
diff --git a/xen/include/asm-x86/pci.h b/xen/include/asm-x86/pci.h
index 70ed48e309..0c79acb1ed 100644
--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -1,6 +1,8 @@
 #ifndef __X86_PCI_H__
 #define __X86_PCI_H__
 
+#include <xen/mm.h>
+
 #define CF8_BDF(cf8)     (  ((cf8) & 0x00ffff00) >> 8)
 #define CF8_ADDR_LO(cf8) (   (cf8) & 0x000000fc)
 #define CF8_ADDR_HI(cf8) (  ((cf8) & 0x0f000000) >> 16)
@@ -20,7 +22,18 @@ struct arch_pci_dev {
      * them don't race (de)initialization and hence don't strictly need any
      * locking.
      */
+    union {
+        /* Subset of struct arch_iommu's fields, to be used in dom_io. */
+        struct {
+            uint64_t pgd_maddr;
+        } vtd;
+        struct {
+            struct page_info *root_table;
+        } amd;
+    };
     domid_t pseudo_domid;
+    mfn_t leaf_mfn;
+    struct page_list_head pgtables_list;
 };
 
 int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index 262679485d..e2b6ca4e1f 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -189,7 +189,7 @@ typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt);
 struct iommu_ops {
     int (*init)(struct domain *d);
     void (*hwdom_init)(struct domain *d);
-    int (*quarantine_init)(struct domain *d);
+    int (*quarantine_init)(device_t *dev);
     int (*add_device)(u8 devfn, device_t *dev);
     int (*enable_device)(device_t *dev);
     int (*remove_device)(u8 devfn, device_t *dev);
@@ -277,6 +277,7 @@ int __must_check iommu_suspend(void);
 void iommu_resume(void);
 void iommu_crash_shutdown(void);
 int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
+int iommu_quarantine_dev_init(device_t *dev);
 
 void iommu_share_p2m_table(struct domain *d);
 
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.12



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.