
[Xen-changelog] [xen staging] AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables



commit b4f042236ae0bb6725b3e8dd40af5a2466a6f971
Author:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
AuthorDate: Wed Dec 11 14:55:32 2019 +0100
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Wed Dec 11 14:55:32 2019 +0100

    AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables
    
    update_paging_mode() has multiple bugs:
    
     1) Booting with iommu=debug will cause it to inform you that it is called
        without the pdev_list lock held.
     2) When growing by more than a single level, it leaks the newly allocated
        table(s) in the case of a further error.
    
    Furthermore, the choice of default level for a domain has issues:
    
     1) All HVM guests grow from 2 to 3 levels during construction because of
        the position of the VRAM just below the 4G boundary, so defaulting to 2
        is a waste of effort.
     2) The limit for PV guests doesn't take memory hotplug into account, and
        isn't dynamic at runtime like HVM guests.  This means that a PV guest
        may get RAM which it can't map in the IOMMU.
    
    The dynamic height is a property unique to AMD, and adds a substantial
    quantity of complexity for what is a marginal performance improvement.
    Remove the complexity by removing the dynamic height.
    
    PV guests now get 3 or 4 levels based on any hotplug regions in the host.
    This only makes a difference for hardware which previously had all RAM below
    the 512G boundary, and a hotplug region above.
    
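    For illustration, the level count follows directly from the highest frame
    number which needs mapping: each extra level resolves another 9 bits of the
    DFN.  A minimal sketch of that calculation (an illustration only, not the
    actual amd_iommu_get_paging_mode() implementation) might look like:
    
        /* Illustration only: 512 PTEs per table, 9 bits resolved per level. */
        #include <stdio.h>
        
        static unsigned int levels_for_max_frame(unsigned long max_frames)
        {
            unsigned int level = 1;
        
            while ( max_frames > 512 )  /* more than one leaf table needed */
            {
                max_frames >>= 9;       /* one extra level per 9 bits */
                level++;
            }
        
            return level;
        }
        
        int main(void)
        {
            /* All RAM below 512GiB (frame bound 1 << 27) -> 3 levels. */
            printf("%u\n", levels_for_max_frame(1UL << 27));
            /* A hotplug region above 512GiB -> 4 levels. */
            printf("%u\n", levels_for_max_frame(1UL << 28));
            return 0;
        }
    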
    HVM guests now get 4 levels (which will be sufficient until 256TB guests
    become a thing), because we don't currently have the information to know
    when 3 would be safe to use.
    
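    To put numbers on these boundaries: each level of 512-entry tables resolves
    9 bits of the DFN on top of the 12-bit page offset, so
    
        2 levels cover 2^(12 + 2*9) = 2^30 bytes = 1 GiB
        3 levels cover 2^(12 + 3*9) = 2^39 bytes = 512 GiB
        4 levels cover 2^(12 + 4*9) = 2^48 bytes = 256 TiB
    
    which is why VRAM just below the 4G boundary pushes HVM guests beyond 2
    levels, why 512G is the 3-vs-4 boundary for PV, and why 4 levels last until
    256TB guests exist.
    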
    The overhead of this extra level is not expected to be noticeable.  It costs
    one page (4k) per domain, and one extra IO-TLB paging structure cache entry
    which is very hot and less likely to be evicted.
    
    This is XSA-311.
    
    Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
    Acked-by: Jan Beulich <jbeulich@xxxxxxxx>
---
 xen/drivers/passthrough/amd/iommu_map.c     | 108 ----------------------------
 xen/drivers/passthrough/amd/pci_amd_iommu.c |  15 ++--
 2 files changed, 11 insertions(+), 112 deletions(-)

diff --git a/xen/drivers/passthrough/amd/iommu_map.c b/xen/drivers/passthrough/amd/iommu_map.c
index 54e1d132d9..4e041b960f 100644
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -285,100 +285,6 @@ static int iommu_pde_from_dfn(struct domain *d, unsigned long dfn,
     return 0;
 }
 
-static int update_paging_mode(struct domain *d, unsigned long dfn)
-{
-    uint16_t bdf;
-    struct amd_iommu_dte *table, *dte;
-    unsigned int req_id, level, offset;
-    unsigned long flags;
-    struct pci_dev *pdev;
-    struct amd_iommu *iommu = NULL;
-    struct page_info *new_root = NULL;
-    struct page_info *old_root = NULL;
-    struct amd_iommu_pte *new_root_vaddr;
-    unsigned long old_root_mfn;
-    struct domain_iommu *hd = dom_iommu(d);
-
-    if ( dfn == dfn_x(INVALID_DFN) )
-        return -EADDRNOTAVAIL;
-    ASSERT(!(dfn >> DEFAULT_DOMAIN_ADDRESS_WIDTH));
-
-    level = hd->arch.paging_mode;
-    old_root = hd->arch.root_table;
-    offset = dfn >> (PTE_PER_TABLE_SHIFT * (level - 1));
-
-    ASSERT(spin_is_locked(&hd->arch.mapping_lock) && is_hvm_domain(d));
-
-    while ( offset >= PTE_PER_TABLE_SIZE )
-    {
-        /* Allocate and install a new root table.
-         * Only upper I/O page table grows, no need to fix next level bits */
-        new_root = alloc_amd_iommu_pgtable();
-        if ( new_root == NULL )
-        {
-            AMD_IOMMU_DEBUG("%s Cannot allocate I/O page table\n",
-                            __func__);
-            return -ENOMEM;
-        }
-
-        new_root_vaddr = __map_domain_page(new_root);
-        old_root_mfn = mfn_x(page_to_mfn(old_root));
-        set_iommu_pde_present(new_root_vaddr, old_root_mfn, level,
-                              true, true);
-        level++;
-        old_root = new_root;
-        offset >>= PTE_PER_TABLE_SHIFT;
-        unmap_domain_page(new_root_vaddr);
-    }
-
-    if ( new_root != NULL )
-    {
-        hd->arch.paging_mode = level;
-        hd->arch.root_table = new_root;
-
-        if ( !pcidevs_locked() )
-            AMD_IOMMU_DEBUG("%s Try to access pdev_list "
-                            "without aquiring pcidevs_lock.\n", __func__);
-
-        /* Update device table entries using new root table and paging mode */
-        for_each_pdev( d, pdev )
-        {
-            if ( pdev->type == DEV_TYPE_PCI_HOST_BRIDGE )
-                continue;
-
-            bdf = PCI_BDF2(pdev->bus, pdev->devfn);
-            iommu = find_iommu_for_device(pdev->seg, bdf);
-            if ( !iommu )
-            {
-                AMD_IOMMU_DEBUG("%s Fail to find iommu.\n", __func__);
-                return -ENODEV;
-            }
-
-            spin_lock_irqsave(&iommu->lock, flags);
-            do {
-                req_id = get_dma_requestor_id(pdev->seg, bdf);
-                table = iommu->dev_table.buffer;
-                dte = &table[req_id];
-
-                /* valid = 0 only works for dom0 passthrough mode */
-                amd_iommu_set_root_page_table(dte,
-                                              page_to_maddr(hd->arch.root_table),
-                                              d->domain_id,
-                                              hd->arch.paging_mode, 1);
-
-                amd_iommu_flush_device(iommu, req_id);
-                bdf += pdev->phantom_stride;
-            } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
-                      PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
-            spin_unlock_irqrestore(&iommu->lock, flags);
-        }
-
-        /* For safety, invalidate all entries */
-        amd_iommu_flush_all_pages(d);
-    }
-    return 0;
-}
-
 int amd_iommu_map_page(struct domain *d, dfn_t dfn, mfn_t mfn,
                        unsigned int flags, unsigned int *flush_flags)
 {
@@ -400,20 +306,6 @@ int amd_iommu_map_page(struct domain *d, dfn_t dfn, mfn_t mfn,
         return rc;
     }
 
-    /* Since HVM domain is initialized with 2 level IO page table,
-     * we might need a deeper page table for wider dfn now */
-    if ( is_hvm_domain(d) )
-    {
-        if ( update_paging_mode(d, dfn_x(dfn)) )
-        {
-            spin_unlock(&hd->arch.mapping_lock);
-            AMD_IOMMU_DEBUG("Update page mode failed dfn = %"PRI_dfn"\n",
-                            dfn_x(dfn));
-            domain_crash(d);
-            return -EFAULT;
-        }
-    }
-
     if ( iommu_pde_from_dfn(d, dfn_x(dfn), pt_mfn, true) || (pt_mfn[1] == 0) )
     {
         spin_unlock(&hd->arch.mapping_lock);
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index 4da6518773..dd3401f0dc 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -251,10 +251,17 @@ static int amd_iommu_domain_init(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    /* For pv and dom0, stick with get_paging_mode(max_page)
-     * For HVM dom0, use 2 level page table at first */
-    hd->arch.paging_mode = is_hvm_domain(d) ?
-        2 : amd_iommu_get_paging_mode(max_page);
+    /*
+     * Choose the number of levels for the IOMMU page tables.
+     * - PV needs 3 or 4, depending on whether there is RAM (including hotplug
+     *   RAM) above the 512G boundary.
+     * - HVM could in principle use 3 or 4 depending on how much guest
+     *   physical address space we give it, but this isn't known yet so use 4
+     *   unilaterally.
+     */
+    hd->arch.paging_mode = is_hvm_domain(d)
+        ? 4 : amd_iommu_get_paging_mode(get_upper_mfn_bound());
+
     return 0;
 }
 
--
generated by git-patchbot for /home/xen/git/xen.git#staging

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 

