IOMMU: don't immediately disable bus mastering on faults Instead, give the owning domain at least a small opportunity of fixing things up, and allow for rare faults to not bring down the device at all. The amount of faults tolerated within a given time period (all numbers are made up with no specific rationale) is higher for Dom0 than for DomU-s. Signed-off-by: Jan Beulich --- a/xen/drivers/passthrough/amd/iommu_init.c +++ b/xen/drivers/passthrough/amd/iommu_init.c @@ -566,7 +566,7 @@ static hw_irq_controller iommu_maskable_ static void parse_event_log_entry(struct amd_iommu *iommu, u32 entry[]) { - u16 domain_id, device_id, bdf, cword, flags; + u16 domain_id, device_id, bdf, flags; u32 code; u64 *addr; int count = 0; @@ -620,18 +620,10 @@ static void parse_event_log_entry(struct "fault address = %#"PRIx64", flags = %#x\n", event_str[code-1], domain_id, device_id, *addr, flags); - /* Tell the device to stop DMAing; we can't rely on the guest to - * control it for us. */ for ( bdf = 0; bdf < ivrs_bdf_entries; bdf++ ) if ( get_dma_requestor_id(iommu->seg, bdf) == device_id ) - { - cword = pci_conf_read16(iommu->seg, PCI_BUS(bdf), - PCI_SLOT(bdf), PCI_FUNC(bdf), - PCI_COMMAND); - pci_conf_write16(iommu->seg, PCI_BUS(bdf), PCI_SLOT(bdf), - PCI_FUNC(bdf), PCI_COMMAND, - cword & ~PCI_COMMAND_MASTER); - } + pci_check_disable_device(iommu->seg, PCI_BUS(bdf), + PCI_DEVFN2(bdf)); } else { --- a/xen/drivers/passthrough/iommu.c +++ b/xen/drivers/passthrough/iommu.c @@ -215,6 +215,7 @@ static int device_assigned(u16 seg, u8 b static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn) { struct hvm_iommu *hd = domain_hvm_iommu(d); + struct pci_dev *pdev; int rc = 0; if ( !iommu_enabled || !hd->platform_ops ) @@ -228,6 +229,13 @@ static int assign_device(struct domain * return -EXDEV; spin_lock(&pcidevs_lock); + pdev = pci_get_pdev(seg, bus, devfn); + if ( pdev ) + { + pdev->fault.threshold = PT_FAULT_THRESHOLD_UNPRIV; + pdev->fault.count = 0; + } + if ( (rc = hd->platform_ops->assign_device(d, seg, bus, devfn)) ) goto done; @@ -239,6 +247,8 @@ static int assign_device(struct domain * goto done; } done: + if ( pdev && pdev->domain != d ) + pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV; spin_unlock(&pcidevs_lock); return rc; } @@ -379,6 +389,9 @@ int deassign_device(struct domain *d, u1 return ret; } + pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV; + pdev->fault.count = 0; + if ( !has_arch_pdevs(d) && need_iommu(d) ) { d->need_iommu = 0; --- a/xen/drivers/passthrough/pci.c +++ b/xen/drivers/passthrough/pci.c @@ -137,6 +137,7 @@ static struct pci_dev *alloc_pdev(struct *((u8*) &pdev->bus) = bus; *((u8*) &pdev->devfn) = devfn; pdev->domain = NULL; + pdev->fault.threshold = PT_FAULT_THRESHOLD_PRIV; INIT_LIST_HEAD(&pdev->msi_list); list_add(&pdev->alldevs_list, &pseg->alldevs_list); spin_lock_init(&pdev->msix_table_lock); @@ -672,6 +673,36 @@ int __init pci_device_detect(u16 seg, u8 return 1; } +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn) +{ + struct pci_dev *pdev; + s_time_t now = NOW(); + u16 cword; + + spin_lock(&pcidevs_lock); + pdev = pci_get_pdev(seg, bus, devfn); + if ( pdev ) + { + if ( now < pdev->fault.time || + now - pdev->fault.time > MILLISECS(10) ) + pdev->fault.count >>= 1; + pdev->fault.time = now; + if ( ++pdev->fault.count < pdev->fault.threshold ) + pdev = NULL; + } + spin_unlock(&pcidevs_lock); + + if ( !pdev ) + return; + + /* Tell the device to stop DMAing; we can't rely on the guest to + * control it for us. */ + cword = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_COMMAND); + pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), + PCI_COMMAND, cword & ~PCI_COMMAND_MASTER); +} + /* * scan pci devices to add all existed PCI devices to alldevs_list, * and setup pci hierarchy in array bus2bridge. --- a/xen/drivers/passthrough/vtd/iommu.c +++ b/xen/drivers/passthrough/vtd/iommu.c @@ -917,7 +917,7 @@ static void __do_iommu_page_fault(struct while (1) { u8 fault_reason; - u16 source_id, cword; + u16 source_id; u32 data; u64 guest_addr; int type; @@ -950,14 +950,8 @@ static void __do_iommu_page_fault(struct iommu_page_fault_do_one(iommu, type, fault_reason, source_id, guest_addr); - /* Tell the device to stop DMAing; we can't rely on the guest to - * control it for us. */ - cword = pci_conf_read16(iommu->intel->drhd->segment, - PCI_BUS(source_id), PCI_SLOT(source_id), - PCI_FUNC(source_id), PCI_COMMAND); - pci_conf_write16(iommu->intel->drhd->segment, PCI_BUS(source_id), - PCI_SLOT(source_id), PCI_FUNC(source_id), - PCI_COMMAND, cword & ~PCI_COMMAND_MASTER); + pci_check_disable_device(iommu->intel->drhd->segment, + PCI_BUS(source_id), PCI_DEVFN2(source_id)); fault_index++; if ( fault_index > cap_num_fault_regs(iommu->cap) ) --- a/xen/include/xen/pci.h +++ b/xen/include/xen/pci.h @@ -64,6 +64,13 @@ struct pci_dev { const u8 devfn; struct pci_dev_info info; struct arch_pci_dev arch; + struct { + s_time_t time; + unsigned int threshold; +#define PT_FAULT_THRESHOLD_PRIV 10 +#define PT_FAULT_THRESHOLD_UNPRIV 3 + unsigned int count; + } fault; u64 vf_rlen[6]; }; @@ -107,6 +114,7 @@ int pci_hide_device(int bus, int devfn); struct pci_dev *pci_get_pdev(int seg, int bus, int devfn); struct pci_dev *pci_get_pdev_by_domain( struct domain *, int seg, int bus, int devfn); +void pci_check_disable_device(u16 seg, u8 bus, u8 devfn); uint8_t pci_conf_read8( unsigned int seg, unsigned int bus, unsigned int dev, unsigned int func,