[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [xen stable-4.13] VT-d: re-assign devices directly
commit 235aa158e0f71ee2bf20155ce6b0b429acf59d37 Author: Jan Beulich <jbeulich@xxxxxxxx> AuthorDate: Tue Apr 5 15:23:57 2022 +0200 Commit: Jan Beulich <jbeulich@xxxxxxxx> CommitDate: Tue Apr 5 15:23:57 2022 +0200 VT-d: re-assign devices directly Devices with RMRRs, due to it being unspecified how/when the specified memory regions may get accessed, may not be left disconnected from their respective mappings (as long as it's not certain that the device has been fully quiesced). Hence rather than unmapping the old context and then mapping the new one, re-assignment needs to be done in a single step. This is CVE-2022-26359 / part of XSA-400. Reported-by: Roger Pau Monné <roger.pau@xxxxxxxxxx> Similarly quarantining scratch-page mode relies on page tables to be continuously wired up. To avoid complicating things more than necessary, treat all devices mostly equally, i.e. regardless of their association with any RMRRs. The main difference is when it comes to updating context entries, which need to be atomic when there are RMRRs. Yet atomicity can only be achieved with CMPXCHG16B, availability of which we can't take for given. The seemingly complicated choice of non-negative return values for domain_context_mapping_one() is to limit code churn: This way callers passing NULL for pdev don't need fiddling with. Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> Reviewed-by: Kevin Tian <kevin.tian@xxxxxxxxx> Reviewed-by: Roger Pau Monné <roger.pau@xxxxxxxxxx> master commit: 8f41e481b4852173909363b88c1ab3da747d3a05 master date: 2022-04-05 14:17:42 +0200 --- xen/drivers/passthrough/vtd/extern.h | 7 +- xen/drivers/passthrough/vtd/iommu.c | 268 +++++++++++++++++++++++++++-------- xen/drivers/passthrough/vtd/iommu.h | 8 +- xen/drivers/passthrough/vtd/quirks.c | 14 +- xen/drivers/passthrough/vtd/vtd.h | 10 +- 5 files changed, 233 insertions(+), 74 deletions(-) diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h index 1cac22a02f..f51f8aae0d 100644 --- a/xen/drivers/passthrough/vtd/extern.h +++ b/xen/drivers/passthrough/vtd/extern.h @@ -85,7 +85,8 @@ void free_pgtable_maddr(u64 maddr); void *map_vtd_domain_page(u64 maddr); void unmap_vtd_domain_page(void *va); int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn, const struct pci_dev *); + uint8_t bus, uint8_t devfn, + const struct pci_dev *pdev, unsigned int mode); int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, u8 bus, u8 devfn); int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); @@ -105,8 +106,8 @@ int is_igd_vt_enabled_quirk(void); void platform_quirks_init(void); void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); -int __must_check me_wifi_quirk(struct domain *domain, - u8 bus, u8 devfn, int map); +int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, + uint8_t devfn, unsigned int mode); void pci_vtd_quirk(const struct pci_dev *); void quirk_iommu_caps(struct vtd_iommu *iommu); diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c index b729ae173a..17deda92d8 100644 --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -110,6 +110,7 @@ static int context_set_domain_id(struct context_entry *context, } set_bit(i, iommu->domid_bitmap); + context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET); context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET; return 0; } @@ -1350,15 +1351,27 @@ static void __hwdom_init intel_iommu_hwdom_init(struct domain *d) } } +/* + * This function returns + * - a negative errno value upon error, + * - zero upon success when previously the entry was non-present, or this isn't + * the "main" request for a device (pdev == NULL), or for no-op quarantining + * assignments, + * - positive (one) upon success when previously the entry was present and this + * is the "main" request for a device (pdev != NULL). + */ int domain_context_mapping_one( struct domain *domain, struct vtd_iommu *iommu, - u8 bus, u8 devfn, const struct pci_dev *pdev) + uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, + unsigned int mode) { struct domain_iommu *hd = dom_iommu(domain); - struct context_entry *context, *context_entries; + struct context_entry *context, *context_entries, lctxt; + __uint128_t old; u64 maddr, pgd_maddr; - u16 seg = iommu->drhd->segment; + uint16_t seg = iommu->drhd->segment, prev_did = 0; + struct domain *prev_dom = NULL; int agaw, rc, ret; bool_t flush_dev_iotlb; @@ -1367,17 +1380,32 @@ int domain_context_mapping_one( maddr = bus_to_context_maddr(iommu, bus); context_entries = (struct context_entry *)map_vtd_domain_page(maddr); context = &context_entries[devfn]; + old = (lctxt = *context).full; - if ( context_present(*context) ) + if ( context_present(lctxt) ) { - spin_unlock(&iommu->lock); - unmap_vtd_domain_page(context_entries); - return 0; + domid_t domid; + + prev_did = context_domain_id(lctxt); + domid = iommu->domid_map[prev_did]; + if ( domid < DOMID_FIRST_RESERVED ) + prev_dom = rcu_lock_domain_by_id(domid); + else if ( domid == DOMID_IO ) + prev_dom = rcu_lock_domain(dom_io); + if ( !prev_dom ) + { + spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); + dprintk(XENLOG_DEBUG VTDPREFIX, + "no domain for did %u (nr_dom %u)\n", + prev_did, cap_ndoms(iommu->cap)); + return -ESRCH; + } } if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) { - context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); + context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU); agaw = level_to_agaw(iommu->nr_pt_levels); } else @@ -1394,6 +1422,8 @@ int domain_context_mapping_one( spin_unlock(&hd->arch.mapping_lock); spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); + if ( prev_dom ) + rcu_unlock_domain(prev_dom); return -ENOMEM; } } @@ -1411,33 +1441,102 @@ int domain_context_mapping_one( goto nomem; } - context_set_address_root(*context, pgd_maddr); + context_set_address_root(lctxt, pgd_maddr); if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) ) - context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB); + context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); else - context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL); + context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL); spin_unlock(&hd->arch.mapping_lock); } - if ( context_set_domain_id(context, domain, iommu) ) + rc = context_set_domain_id(&lctxt, domain, iommu); + if ( rc ) { + unlock: spin_unlock(&iommu->lock); unmap_vtd_domain_page(context_entries); - return -EFAULT; + if ( prev_dom ) + rcu_unlock_domain(prev_dom); + return rc; + } + + if ( !prev_dom ) + { + context_set_address_width(lctxt, agaw); + context_set_fault_enable(lctxt); + context_set_present(lctxt); + } + else if ( prev_dom == domain ) + { + ASSERT(lctxt.full == context->full); + rc = !!pdev; + goto unlock; + } + else + { + ASSERT(context_address_width(lctxt) == agaw); + ASSERT(!context_fault_disable(lctxt)); + } + + if ( cpu_has_cx16 ) + { + __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); + + /* + * Hardware does not update the context entry behind our backs, + * so the return value should match "old". + */ + if ( res != old ) + { + if ( pdev ) + check_cleanup_domid_map(domain, pdev, iommu); + printk(XENLOG_ERR + "%04x:%02x:%02x.%u: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", + pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + (uint64_t)(res >> 64), (uint64_t)res, + (uint64_t)(old >> 64), (uint64_t)old); + rc = -EILSEQ; + goto unlock; + } + } + else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) + { + context_clear_present(*context); + iommu_sync_cache(context, sizeof(*context)); + + write_atomic(&context->hi, lctxt.hi); + /* No barrier should be needed between these two. */ + write_atomic(&context->lo, lctxt.lo); + } + else /* Best effort, updating DID last. */ + { + /* + * By non-atomically updating the context entry's DID field last, + * during a short window in time TLB entries with the old domain ID + * but the new page tables may be inserted. This could affect I/O + * of other devices using this same (old) domain ID. Such updating + * therefore is not a problem if this was the only device associated + * with the old domain ID. Diverting I/O of any of a dying domain's + * devices to the quarantine page tables is intended anyway. + */ + if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) + printk(XENLOG_WARNING VTDPREFIX + " %04x:%02x:%02x.%u: reassignment may cause %pd data corruption\n", + seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), prev_dom); + + write_atomic(&context->lo, lctxt.lo); + /* No barrier should be needed between these two. */ + write_atomic(&context->hi, lctxt.hi); } - context_set_address_width(*context, agaw); - context_set_fault_enable(*context); - context_set_present(*context); iommu_sync_cache(context, sizeof(struct context_entry)); spin_unlock(&iommu->lock); - /* Context entry was previously non-present (with domid 0). */ - rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn), - DMA_CCMD_MASK_NOBIT, 1); + rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn), + DMA_CCMD_MASK_NOBIT, !prev_dom); flush_dev_iotlb = !!find_ats_dev_drhd(iommu); - ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb); + ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb); /* * The current logic for returns: @@ -1458,12 +1557,21 @@ int domain_context_mapping_one( unmap_vtd_domain_page(context_entries); if ( !seg && !rc ) - rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC); + rc = me_wifi_quirk(domain, bus, devfn, mode); if ( rc ) - domain_context_unmap_one(domain, iommu, bus, devfn); + { + if ( !prev_dom ) + domain_context_unmap_one(domain, iommu, bus, devfn); + else if ( prev_dom != domain ) /* Avoid infinite recursion. */ + domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, + mode & MAP_WITH_RMRR); + } - return rc; + if ( prev_dom ) + rcu_unlock_domain(prev_dom); + + return rc ?: pdev && prev_dom; } static int domain_context_unmap(struct domain *d, uint8_t devfn, @@ -1473,8 +1581,11 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; + const struct acpi_rmrr_unit *rmrr; int ret = 0; - u8 seg = pdev->seg, bus = pdev->bus, secbus; + unsigned int i, mode = 0; + uint16_t seg = pdev->seg, bdf; + uint8_t bus = pdev->bus, secbus; drhd = acpi_find_matched_drhd_unit(pdev); if ( !drhd ) @@ -1493,8 +1604,29 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, ASSERT(pcidevs_locked()); + for_each_rmrr_device( rmrr, bdf, i ) + { + if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf ) + continue; + + mode |= MAP_WITH_RMRR; + break; + } + + if ( domain != pdev->domain ) + { + if ( pdev->domain->is_dying ) + mode |= MAP_OWNER_DYING; + else if ( drhd && + !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && + !pdev->phantom_stride ) + mode |= MAP_SINGLE_DEVICE; + } + switch ( pdev->type ) { + bool prev_present; + case DEV_TYPE_PCI_HOST_BRIDGE: if ( iommu_debug ) printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n", @@ -1515,7 +1647,9 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev); + pdev, mode); + if ( ret > 0 ) + ret = 0; if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) enable_ats_device(pdev, &drhd->iommu->ats_devices); @@ -1528,9 +1662,10 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, PCI_SLOT(devfn), PCI_FUNC(devfn)); ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - pdev); - if ( ret ) + pdev, mode); + if ( ret < 0 ) break; + prev_present = ret; if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 ) { @@ -1538,6 +1673,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, break; ret = -ENXIO; } + /* + * Strictly speaking if the device is the only one behind this bridge + * and the only one with this (secbus,0,0) tuple, it could be allowed + * to be re-assigned regardless of RMRR presence. But let's deal with + * that case only if it is actually found in the wild. + */ + else if ( prev_present && (mode & MAP_WITH_RMRR) && + domain != pdev->domain ) + ret = -EOPNOTSUPP; /* * Mapping a bridge should, if anything, pass the struct pci_dev of @@ -1546,7 +1690,7 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, */ if ( ret >= 0 ) ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, - NULL); + NULL, mode); /* * Devices behind PCIe-to-PCI/PCIx bridge may generate different @@ -1561,10 +1705,15 @@ static int domain_context_mapping(struct domain *domain, u8 devfn, if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && (secbus != pdev->bus || pdev->devfn != 0) ) ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, - NULL); + NULL, mode); if ( ret ) - domain_context_unmap(domain, devfn, pdev); + { + if ( !prev_present ) + domain_context_unmap(domain, devfn, pdev); + else if ( pdev->domain != domain ) /* Avoid infinite recursion. */ + domain_context_mapping(pdev->domain, devfn, pdev); + } break; @@ -2331,9 +2480,8 @@ static int reassign_device_ownership( { int ret; - ret = domain_context_unmap(source, devfn, pdev); - if ( ret ) - return ret; + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_assign(target); /* * Devices assigned to untrusted domains (here assumed to be any domU) @@ -2343,6 +2491,31 @@ static int reassign_device_ownership( if ( (target != hardware_domain) && !iommu_intremap ) untrusted_msi = true; + ret = domain_context_mapping(target, devfn, pdev); + if ( ret ) + { + if ( !has_arch_pdevs(target) ) + vmx_pi_hooks_deassign(target); + return ret; + } + + if ( pdev->devfn == devfn ) + { + const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); + + if ( drhd ) + check_cleanup_domid_map(source, pdev, drhd->iommu); + } + + if ( devfn == pdev->devfn && pdev->domain != target ) + { + list_move(&pdev->domain_list, &target->pdev_list); + pdev->domain = target; + } + + if ( !has_arch_pdevs(source) ) + vmx_pi_hooks_deassign(source); + /* * If the device belongs to the hardware domain, and it has RMRR, don't * remove it from the hardware domain, because BIOS may use RMRR at @@ -2371,34 +2544,7 @@ static int reassign_device_ownership( } } - if ( devfn == pdev->devfn && pdev->domain != dom_io ) - { - list_move(&pdev->domain_list, &dom_io->pdev_list); - pdev->domain = dom_io; - } - - if ( !has_arch_pdevs(source) ) - vmx_pi_hooks_deassign(source); - - if ( !has_arch_pdevs(target) ) - vmx_pi_hooks_assign(target); - - ret = domain_context_mapping(target, devfn, pdev); - if ( ret ) - { - if ( !has_arch_pdevs(target) ) - vmx_pi_hooks_deassign(target); - - return ret; - } - - if ( devfn == pdev->devfn && pdev->domain != target ) - { - list_move(&pdev->domain_list, &target->pdev_list); - pdev->domain = target; - } - - return ret; + return 0; } static int intel_iommu_assign_device( diff --git a/xen/drivers/passthrough/vtd/iommu.h b/xen/drivers/passthrough/vtd/iommu.h index 32b39c606a..503b07ffb7 100644 --- a/xen/drivers/passthrough/vtd/iommu.h +++ b/xen/drivers/passthrough/vtd/iommu.h @@ -202,8 +202,12 @@ struct root_entry { do {(root).val |= ((value) & PAGE_MASK_4K);} while(0) struct context_entry { - u64 lo; - u64 hi; + union { + struct { + uint64_t lo, hi; + }; + __uint128_t full; + }; }; #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry)) #define context_present(c) ((c).lo & 1) diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c index 435e449ca3..99e159b4e9 100644 --- a/xen/drivers/passthrough/vtd/quirks.c +++ b/xen/drivers/passthrough/vtd/quirks.c @@ -343,7 +343,8 @@ void __init platform_quirks_init(void) */ static int __must_check map_me_phantom_function(struct domain *domain, - u32 dev, int map) + unsigned int dev, + unsigned int mode) { struct acpi_drhd_unit *drhd; struct pci_dev *pdev; @@ -354,9 +355,9 @@ static int __must_check map_me_phantom_function(struct domain *domain, drhd = acpi_find_matched_drhd_unit(pdev); /* map or unmap ME phantom function */ - if ( map ) + if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) rc = domain_context_mapping_one(domain, drhd->iommu, 0, - PCI_DEVFN(dev, 7), NULL); + PCI_DEVFN(dev, 7), NULL, mode); else rc = domain_context_unmap_one(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7)); @@ -364,7 +365,8 @@ static int __must_check map_me_phantom_function(struct domain *domain, return rc; } -int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) +int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + unsigned int mode) { u32 id; int rc = 0; @@ -388,7 +390,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) case 0x423b8086: case 0x423c8086: case 0x423d8086: - rc = map_me_phantom_function(domain, 3, map); + rc = map_me_phantom_function(domain, 3, mode); break; default: break; @@ -414,7 +416,7 @@ int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map) case 0x42388086: /* Puma Peak */ case 0x422b8086: case 0x422c8086: - rc = map_me_phantom_function(domain, 22, map); + rc = map_me_phantom_function(domain, 22, mode); break; default: break; diff --git a/xen/drivers/passthrough/vtd/vtd.h b/xen/drivers/passthrough/vtd/vtd.h index bb8889f350..e4ab242fee 100644 --- a/xen/drivers/passthrough/vtd/vtd.h +++ b/xen/drivers/passthrough/vtd/vtd.h @@ -22,8 +22,14 @@ #include <xen/iommu.h> -#define MAP_ME_PHANTOM_FUNC 1 -#define UNMAP_ME_PHANTOM_FUNC 0 +/* + * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode" + * parameters. + */ +#define MAP_WITH_RMRR (1u << 0) +#define MAP_OWNER_DYING (1u << 1) +#define MAP_SINGLE_DEVICE (1u << 2) +#define UNMAP_ME_PHANTOM_FUNC (1u << 3) /* Allow for both IOAPIC and IOSAPIC. */ #define IO_xAPIC_route_entry IO_APIC_route_entry -- generated by git-patchbot for /home/xen/git/xen.git#stable-4.13
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |