Re: [Xen-devel] [PATCH] AMD IOMMU: add mechanism to protect their PCI devices' config spaces
On 22/06/2012 09:18, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:
> Recent Dom0 kernels want to disable PCI MSI on all devices, yet doing
> so on AMD IOMMUs (each of which is represented by a PCI device) disables
> part of the functionality set up by the hypervisor.
>
> Add a mechanism to mark certain PCI devices as having write-protected
> config spaces (both through port-based [method 1] accesses and, for
> x86-64, mmconfig), and use it for AMD's IOMMUs.
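
For readers new to the mechanism: pci_ro_device() records the device in a
per-segment bitmap indexed by BDF, and pci_cfg_ok() (the CF8/CFC port
intercept, which can only reach segment 0 anyway) consults that bitmap
before permitting a write. Below is a minimal stand-alone model of that
bookkeeping; it is plain user-space C for illustration, not Xen code, and
the helper names set_ro()/is_ro() are made up.

/* Toy model of the read-only BDF bitmap the patch introduces; set_ro()
 * stands in for what pci_ro_device() records, is_ro() for the check
 * pci_cfg_ok() performs.  Illustration only, not Xen code. */
#include <limits.h>
#include <stdio.h>

#define PCI_BDF(b, d, f) ((((b) & 0xff) << 8) | (((d) & 0x1f) << 3) | ((f) & 7))
#define BITS_PER_LONG    (CHAR_BIT * sizeof(unsigned long))

static unsigned long ro_map[(PCI_BDF(0xff, 0x1f, 7) + 1 + BITS_PER_LONG - 1) /
                            BITS_PER_LONG];

static void set_ro(unsigned int bdf)
{
    ro_map[bdf / BITS_PER_LONG] |= 1UL << (bdf % BITS_PER_LONG);
}

static int is_ro(unsigned int bdf)
{
    return (ro_map[bdf / BITS_PER_LONG] >> (bdf % BITS_PER_LONG)) & 1;
}

int main(void)
{
    /* AMD IOMMUs typically appear as function 00:00.2. */
    unsigned int iommu = PCI_BDF(0x00, 0x00, 2);

    set_ro(iommu);                        /* what pci_ro_device() records */
    printf("config write to %04x: %s\n", iommu,
           is_ro(iommu) ? "blocked" : "allowed"); /* pci_cfg_ok()'s check */
    return 0;
}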
>
> Note that due to ptwr_do_page_fault() being run first, there'll be a
> MEM_LOG() issued for each such mmconfig based write attempt. If that's
> undesirable, the order of the calls in fixup_page_fault() would need
> to be swapped.
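
In other words, both handlers sit behind the same write-fault pre-check in
fixup_page_fault(), and the writable-pagetable path gets the first try. A
toy stand-alone model of that ordering (the function names mirror the real
ones, but the bodies are stand-ins, not the hypervisor logic):

#include <stdio.h>

static int ptwr_do_page_fault(unsigned long addr)
{
    /* Not a page-table write in this toy; the real code emits a
     * MEM_LOG() on this failure path, hence the message noted above. */
    printf("MEM_LOG: ptwr cannot handle write at %#lx\n", addr);
    return 0;
}

static int mmio_ro_do_page_fault(unsigned long addr)
{
    printf("write to r/o MMIO at %#lx emulated (dropped)\n", addr);
    return 1; /* stands in for EXCRET_fault_fixed */
}

int main(void)
{
    unsigned long addr = 0xf8000000UL;

    /* Tried in this order, matching the hunk below. */
    if (ptwr_do_page_fault(addr) || mmio_ro_do_page_fault(addr))
        return 0; /* fault fixed up */
    return 1;
}

Swapping the two calls would silence the log, at the cost of probing the
rare r/o MMIO case on every qualifying fault first.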
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Tested-by: Wei Wang <wei.wang2@xxxxxxx>
Acked-by: Keir Fraser <keir@xxxxxxx>
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -5209,6 +5209,97 @@ int ptwr_do_page_fault(struct vcpu *v, u
>      return 0;
>  }
>
> +#ifdef __x86_64__
> +/*************************
> + * fault handling for read-only MMIO pages
> + */
> +
> +struct mmio_ro_emulate_ctxt {
> +    struct x86_emulate_ctxt ctxt;
> +    unsigned long cr2;
> +};
> +
> +static int mmio_ro_emulated_read(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    void *p_data,
> +    unsigned int bytes,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    return X86EMUL_UNHANDLEABLE;
> +}
> +
> +static int mmio_ro_emulated_write(
> +    enum x86_segment seg,
> +    unsigned long offset,
> +    void *p_data,
> +    unsigned int bytes,
> +    struct x86_emulate_ctxt *ctxt)
> +{
> +    struct mmio_ro_emulate_ctxt *mmio_ro_ctxt =
> +        container_of(ctxt, struct mmio_ro_emulate_ctxt, ctxt);
> +
> +    /* Only allow naturally-aligned stores at the original %cr2 address. */
> +    if ( ((bytes | offset) & (bytes - 1)) || offset != mmio_ro_ctxt->cr2 )
> +    {
> +        MEM_LOG("mmio_ro_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
> +                mmio_ro_ctxt->cr2, offset, bytes);
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    return X86EMUL_OKAY;
> +}
> +
> +static const struct x86_emulate_ops mmio_ro_emulate_ops = {
> +    .read       = mmio_ro_emulated_read,
> +    .insn_fetch = ptwr_emulated_read,
> +    .write      = mmio_ro_emulated_write,
> +};
> +
> +/* Check if guest is trying to modify a r/o MMIO page. */
> +int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr,
> +                          struct cpu_user_regs *regs)
> +{
> +    l1_pgentry_t pte;
> +    unsigned long mfn;
> +    unsigned int addr_size = is_pv_32on64_domain(v->domain) ?
> +                             32 : BITS_PER_LONG;
> +    struct mmio_ro_emulate_ctxt mmio_ro_ctxt = {
> +        .ctxt.regs = regs,
> +        .ctxt.addr_size = addr_size,
> +        .ctxt.sp_size = addr_size,
> +        .cr2 = addr
> +    };
> +    int rc;
> +
> +    /* Attempt to read the PTE that maps the VA being accessed. */
> +    guest_get_eff_l1e(v, addr, &pte);
> +
> +    /* We are looking only for read-only mappings of MMIO pages. */
> +    if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT|_PAGE_RW)) != _PAGE_PRESENT) )
> +        return 0;
> +
> +    mfn = l1e_get_pfn(pte);
> +    if ( mfn_valid(mfn) )
> +    {
> +        struct page_info *page = mfn_to_page(mfn);
> +        struct domain *owner = page_get_owner_and_reference(page);
> +
> +        if ( owner )
> +            put_page(page);
> +        if ( owner != dom_io )
> +            return 0;
> +    }
> +
> +    if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
> +        return 0;
> +
> +    rc = x86_emulate(&mmio_ro_ctxt.ctxt, &mmio_ro_emulate_ops);
> +
> +    return rc != X86EMUL_UNHANDLEABLE ? EXCRET_fault_fixed : 0;
> +}
> +#endif /* __x86_64__ */
> +
>  void free_xen_pagetable(void *v)
>  {
>      if ( system_state == SYS_STATE_early_boot )
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1349,20 +1349,23 @@ static int fixup_page_fault(unsigned lon
>          return 0;
>      }
>
> -    if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
> -         guest_kernel_mode(v, regs) )
> -    {
> -        unsigned int mbs = PFEC_write_access;
> -        unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch;
> -
> -        /* Do not check if access-protection fault since the page may
> -           legitimately be not present in shadow page tables */
> -        if ( !paging_mode_enabled(d) )
> -            mbs |= PFEC_page_present;
> -
> -        if ( ((regs->error_code & (mbs | mbz)) == mbs) &&
> +    if ( guest_kernel_mode(v, regs) &&
> +         !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
> +         (regs->error_code & PFEC_write_access) )
> +    {
> +        if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) &&
> +             /* Do not check if access-protection fault since the page may
> +                legitimately be not present in shadow page tables */
> +             (paging_mode_enabled(d) ||
> +              (regs->error_code & PFEC_page_present)) &&
>               ptwr_do_page_fault(v, addr, regs) )
>              return EXCRET_fault_fixed;
> +
> +#ifdef __x86_64__
> +        if ( IS_PRIV(d) && (regs->error_code & PFEC_page_present) &&
> +             mmio_ro_do_page_fault(v, addr, regs) )
> +            return EXCRET_fault_fixed;
> +#endif
>      }
>
>      /* For non-external shadowed guests, we fix up both their own
> @@ -1690,6 +1693,13 @@ static int pci_cfg_ok(struct domain *d,
>          return 0;
>
>      machine_bdf = (d->arch.pci_cf8 >> 8) & 0xFFFF;
> +    if ( write )
> +    {
> +        const unsigned long *ro_map = pci_get_ro_map(0);
> +
> +        if ( ro_map && test_bit(machine_bdf, ro_map) )
> +            return 0;
> +    }
>      start = d->arch.pci_cf8 & 0xFF;
>      end = start + size - 1;
>      if (xsm_pci_config_permission(d, machine_bdf, start, end, write))
> --- a/xen/arch/x86/x86_32/pci.c
> +++ b/xen/arch/x86/x86_32/pci.c
> @@ -6,6 +6,7 @@
>
>  #include <xen/spinlock.h>
>  #include <xen/pci.h>
> +#include <xen/init.h>
>  #include <asm/io.h>
>
>  #define PCI_CONF_ADDRESS(bus, dev, func, reg) \
> @@ -70,3 +71,7 @@ void pci_conf_write32(
>      BUG_ON((bus > 255) || (dev > 31) || (func > 7) || (reg > 255));
>      pci_conf_write(PCI_CONF_ADDRESS(bus, dev, func, reg), 0, 4, data);
>  }
> +
> +void __init arch_pci_ro_device(int seg, int bdf)
> +{
> +}
> --- a/xen/arch/x86/x86_64/mmconfig_64.c
> +++ b/xen/arch/x86/x86_64/mmconfig_64.c
> @@ -14,6 +14,8 @@
>  #include <xen/xmalloc.h>
>  #include <xen/pci.h>
>  #include <xen/pci_regs.h>
> +#include <xen/iommu.h>
> +#include <xen/rangeset.h>
>
>  #include "mmconfig.h"
>
> @@ -132,9 +134,30 @@ static void __iomem *mcfg_ioremap(const
>      return (void __iomem *) virt;
>  }
>
> +void arch_pci_ro_device(int seg, int bdf)
> +{
> +    unsigned int idx, bus = PCI_BUS(bdf);
> +
> +    for (idx = 0; idx < pci_mmcfg_config_num; ++idx) {
> +        const struct acpi_mcfg_allocation *cfg = pci_mmcfg_virt[idx].cfg;
> +        unsigned long mfn = (cfg->address >> PAGE_SHIFT) + bdf;
> +
> +        if (!pci_mmcfg_virt[idx].virt || cfg->pci_segment != seg ||
> +            cfg->start_bus_number > bus || cfg->end_bus_number < bus)
> +            continue;
> +
> +        if (rangeset_add_singleton(mmio_ro_ranges, mfn))
> +            printk(XENLOG_ERR
> +                   "%04x:%02x:%02x.%u: could not mark MCFG (mfn %#lx) read-only\n",
> +                   cfg->pci_segment, bus, PCI_SLOT(bdf), PCI_FUNC(bdf),
> +                   mfn);
> +    }
> +}
> +
>  int pci_mmcfg_arch_enable(unsigned int idx)
>  {
>      const typeof(pci_mmcfg_config[0]) *cfg = pci_mmcfg_virt[idx].cfg;
> +    const unsigned long *ro_map = pci_get_ro_map(cfg->pci_segment);
>
>      if (pci_mmcfg_virt[idx].virt)
>          return 0;
> @@ -146,6 +169,16 @@ int pci_mmcfg_arch_enable(unsigned int i
>      }
>      printk(KERN_INFO "PCI: Using MCFG for segment %04x bus %02x-%02x\n",
>             cfg->pci_segment, cfg->start_bus_number, cfg->end_bus_number);
> +    if (ro_map) {
> +        unsigned int bdf = PCI_BDF(cfg->start_bus_number, 0, 0);
> +        unsigned int end = PCI_BDF(cfg->end_bus_number, -1, -1);
> +
> +        while ((bdf = find_next_bit(ro_map, end + 1, bdf)) <= end) {
> +            arch_pci_ro_device(cfg->pci_segment, bdf);
> +            if (bdf++ == end)
> +                break;
> +        }
> +    }
>      return 0;
>  }
>
> --- a/xen/drivers/passthrough/amd/iommu_detect.c
> +++ b/xen/drivers/passthrough/amd/iommu_detect.c
> @@ -153,6 +153,12 @@ int __init amd_iommu_detect_one_acpi(
>      if ( rt )
>          return -ENODEV;
>
> +    rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
> +    if ( rt )
> +        printk(XENLOG_ERR
> +               "Could not mark config space of %04x:%02x:%02x.%u read-only (%d)\n",
> +               iommu->seg, bus, dev, func, rt);
> +
>      list_add_tail(&iommu->list, &amd_iommu_head);
>
>      return 0;
> --- a/xen/drivers/passthrough/io.c
> +++ b/xen/drivers/passthrough/io.c
> @@ -593,11 +593,3 @@ void hvm_dpci_eoi(struct domain *d, unsi
>   unlock:
>      spin_unlock(&d->event_lock);
>  }
> -
> -static int __init setup_mmio_ro_ranges(void)
> -{
> -    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
> -                                  RANGESETF_prettyprint_hex);
> -    return 0;
> -}
> -__initcall(setup_mmio_ro_ranges);
> --- a/xen/drivers/passthrough/pci.c
> +++ b/xen/drivers/passthrough/pci.c
> @@ -36,6 +36,7 @@
>  struct pci_seg {
>      struct list_head alldevs_list;
>      u16 nr;
> +    unsigned long *ro_map;
>      /* bus2bridge_lock protects bus2bridge array */
>      spinlock_t bus2bridge_lock;
>  #define MAX_BUSES 256
> @@ -106,6 +107,8 @@ void __init pt_pci_init(void)
>      radix_tree_init(&pci_segments);
>      if ( !alloc_pseg(0) )
>          panic("Could not initialize PCI segment 0\n");
> +    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
> +                                  RANGESETF_prettyprint_hex);
>  }
>
>  int __init pci_add_segment(u16 seg)
> @@ -113,6 +116,13 @@ int __init pci_add_segment(u16 seg)
>      return alloc_pseg(seg) ? 0 : -ENOMEM;
>  }
>
> +const unsigned long *pci_get_ro_map(u16 seg)
> +{
> +    struct pci_seg *pseg = get_pseg(seg);
> +
> +    return pseg ? pseg->ro_map : NULL;
> +}
> +
>  static struct pci_dev *alloc_pdev(struct pci_seg *pseg, u8 bus, u8 devfn)
>  {
>      struct pci_dev *pdev;
> @@ -198,6 +208,33 @@ static void free_pdev(struct pci_seg *ps
>      xfree(pdev);
>  }
>
> +int __init pci_ro_device(int seg, int bus, int devfn)
> +{
> +    struct pci_seg *pseg = alloc_pseg(seg);
> +    struct pci_dev *pdev;
> +
> +    if ( !pseg )
> +        return -ENOMEM;
> +    pdev = alloc_pdev(pseg, bus, devfn);
> +    if ( !pdev )
> +        return -ENOMEM;
> +
> +    if ( !pseg->ro_map )
> +    {
> +        size_t sz = BITS_TO_LONGS(PCI_BDF(-1, -1, -1) + 1) * sizeof(long);
> +
> +        pseg->ro_map = alloc_xenheap_pages(get_order_from_bytes(sz), 0);
> +        if ( !pseg->ro_map )
> +            return -ENOMEM;
> +        memset(pseg->ro_map, 0, sz);
> +    }
> +
> +    __set_bit(PCI_BDF2(bus, devfn), pseg->ro_map);
> +    arch_pci_ro_device(seg, PCI_BDF2(bus, devfn));
> +
> +    return 0;
> +}
> +
>  struct pci_dev *pci_get_pdev(int seg, int bus, int devfn)
>  {
>      struct pci_seg *pseg = get_pseg(seg);
> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -555,6 +555,8 @@ void memguard_unguard_stack(void *p);
>
>  int ptwr_do_page_fault(struct vcpu *, unsigned long,
>                         struct cpu_user_regs *);
> +int mmio_ro_do_page_fault(struct vcpu *, unsigned long,
> +                          struct cpu_user_regs *);
>
>  int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
>
> --- a/xen/include/xen/pci.h
> +++ b/xen/include/xen/pci.h
> @@ -98,8 +98,11 @@ struct pci_dev *pci_lock_domain_pdev(
>  void setup_dom0_pci_devices(struct domain *, void (*)(struct pci_dev *));
>  void pci_release_devices(struct domain *d);
>  int pci_add_segment(u16 seg);
> +const unsigned long *pci_get_ro_map(u16 seg);
>  int pci_add_device(u16 seg, u8 bus, u8 devfn, const struct pci_dev_info *);
>  int pci_remove_device(u16 seg, u8 bus, u8 devfn);
> +int pci_ro_device(int seg, int bus, int devfn);
> +void arch_pci_ro_device(int seg, int bdf);
>  struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
>  struct pci_dev *pci_get_pdev_by_domain(
>      struct domain *, int seg, int bus, int devfn);
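
A side note on mmio_ro_emulated_write() in the mm.c hunk above: the test
((bytes | offset) & (bytes - 1)) is zero exactly when bytes is a power of
two and offset is a multiple of bytes, i.e. for naturally aligned stores.
A stand-alone demonstration (plain C, nothing Xen-specific assumed):

#include <stdio.h>

/* Zero iff bytes is a power of two and offset is a multiple of bytes,
 * i.e. the store is naturally aligned; this is the exact expression
 * used in mmio_ro_emulated_write() above. */
static int naturally_aligned(unsigned long offset, unsigned int bytes)
{
    return ((bytes | offset) & (bytes - 1)) == 0;
}

int main(void)
{
    printf("%d\n", naturally_aligned(0x1000, 4)); /* 1: aligned 4-byte store */
    printf("%d\n", naturally_aligned(0x1002, 4)); /* 0: misaligned */
    printf("%d\n", naturally_aligned(0x1000, 3)); /* 0: 3 is not a power of 2 */
    return 0;
}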
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel