
Re: [Xen-devel] [PATCH] xen/mcfg: Call PHYSDEVOP_pci_mmcfg_reserved before PCI enumeration



On Tue, Sep 22, 2015 at 01:17:03PM -0700, Ed Swierk wrote:
> RFC. Tested with 3.14.51 and Xen 4.5.1. I'll make a proper patch against a
> more current kernel if we decide this is heading in the right direction.

CC-ing David as well
> 
> 
>     xen/mcfg: Notify Xen of PCI MMCONFIG area before adding device
> 
>     On systems where the ACPI DSDT advertises a PCI MMCONFIG area but the
>     E820 table does not reserve it, it's up to Dom0 to inform Xen of
>     MMCONFIG areas via PHYSDEVOP_pci_mmcfg_reserved.  This needs to happen
>     before Xen tries to access extended capabilities like SRIOV in
>     pci_add_device(), which is triggered by PHYSDEVOP_pci_device_add et al.
> 
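To make the required ordering concrete, this is roughly the hypercall
sequence the patch enforces (field and struct names are the ones used in
the hunks below; error handling is omitted, so read it as a sketch, not
as the final code):

	struct physdev_pci_mmcfg_reserved r = {
		.address   = cfg->address,
		.segment   = cfg->segment,
		.start_bus = cfg->start_bus,
		.end_bus   = cfg->end_bus,
		.flags     = XEN_PCI_MMCFG_RESERVED,
	};

	/* 1) Tell Xen the MMCONFIG area may be used... */
	HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);

	/* 2) ...and only then add the device, so that Xen's
	 * pci_add_device() can reach extended config space
	 * (e.g. the SR-IOV capability). */
	HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
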
>     Since both MMCONFIG discovery and PCI bus enumeration occur in
>     acpi_init(), calling PHYSDEVOP_pci_mmcfg_reserved cannot be delegated
>     to a separate initcall.  Instead, it can be done in xen_pci_notifier()
>     immediately before calling PHYSDEVOP_pci_device_add.
> 
>     Without this change, Xen 4.4 and 4.5 emit WARN messages from
>     msix_capability_init() when setting up Intel 82599 VFs, since vf_rlen
>     has not been initialized by pci_add_device().  And on Xen 4.5, Xen
>     nukes the DomU due to "Potentially insecure use of MSI-X" when the VF
>     driver loads in the DomU.  Both problems are fixed by this change.
> 
>     Signed-off-by: Ed Swierk <eswierk@xxxxxxxxxxxxxxxxxx>
> 
> diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
> index dd9c249..47f6b45 100644
> --- a/drivers/xen/pci.c
> +++ b/drivers/xen/pci.c
> @@ -26,8 +26,57 @@
>  #include <asm/xen/hypervisor.h>
>  #include <asm/xen/hypercall.h>
>  #include "../pci/pci.h"
> +
>  #ifdef CONFIG_PCI_MMCONFIG
>  #include <asm/pci_x86.h>
> +#include <linux/list.h>
> +
> +/* pci_mmcfg_list entries that have already been reported to Xen */
> +static LIST_HEAD(xen_pci_mmcfg_list);
> +
> +static void xen_report_pci_mmcfg_region(u16 segment, u8 bus)
> +{
> +	struct pci_mmcfg_region *cfg, *new;
> +	struct physdev_pci_mmcfg_reserved r;
> +	int rc;
> +
> +	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
> +		return;
> +
> +	cfg = pci_mmconfig_lookup(segment, bus);
> +	if (!cfg)
> +		return;
> +
> +	list_for_each_entry_rcu(new, &xen_pci_mmcfg_list, list) {
> +		if (new->segment == cfg->segment &&
> +		    new->start_bus == cfg->start_bus &&
> +		    new->end_bus == cfg->end_bus)
> +			return;
> +	}
> +
> +	r.address = cfg->address;
> +	r.segment = cfg->segment;
> +	r.start_bus = cfg->start_bus;
> +	r.end_bus = cfg->end_bus;
> +	r.flags = XEN_PCI_MMCFG_RESERVED;
> +	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
> +	if (rc != 0 && rc != -ENOSYS) {
> +		pr_warn("Failed to report MMCONFIG reservation state for %s"
> +			" to hypervisor (%d)\n", cfg->name, rc);
> +	}
> +
> +	new = kmemdup(cfg, sizeof(*new), GFP_KERNEL);
> +	if (new) {
> +		list_add_tail_rcu(&new->list, &xen_pci_mmcfg_list);
> +	} else {
> +		pr_warn("Failed to allocate xen_pci_mmcfg_list entry\n");
> +	}
> +}
> +
> +#else
> +
> +static void xen_report_pci_mmcfg_region(u16 segment, u8 bus) { }
> +
>  #endif
> 
>  static bool __read_mostly pci_seg_supported = true;
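One point worth double-checking before this goes non-RFC: the list walk
in xen_report_pci_mmcfg_region() uses list_for_each_entry_rcu(), but
nothing on this path appears to take rcu_read_lock(), and two devices
being added in parallel could race on the unlocked list_add_tail_rcu().
Since the list only ever grows and is private to this file, a plain
mutex around the lookup/report/record sequence would be the simplest
answer - a sketch, where xen_mcfg_lock is a name invented here:

	/* hypothetical: serializes xen_pci_mmcfg_list updates */
	static DEFINE_MUTEX(xen_mcfg_lock);

	mutex_lock(&xen_mcfg_lock);
	/* ... duplicate check, hypercall, kmemdup() + list add ... */
	mutex_unlock(&xen_mcfg_lock);
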
> @@ -86,6 +135,7 @@ static int xen_add_device(struct device *dev)
>  		}
>  #endif /* CONFIG_ACPI */
> 
> +		xen_report_pci_mmcfg_region(add.seg, add.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
>  		if (r != -ENOSYS)
>  			return r;
> @@ -104,6 +154,7 @@ static int xen_add_device(struct device *dev)
>  			.physfn.devfn = physfn->devfn,
>  		};
> 
> +		xen_report_pci_mmcfg_region(0, manage_pci_ext.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
>  					  &manage_pci_ext);
>  	}
> @@ -115,6 +166,7 @@ static int xen_add_device(struct device *dev)
>  			.is_extfn = 1,
>  		};
> 
> +		xen_report_pci_mmcfg_region(0, manage_pci_ext.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
>  					  &manage_pci_ext);
>  	} else {
> @@ -123,6 +175,7 @@ static int xen_add_device(struct device *dev)
>  			.devfn = pci_dev->devfn,
>  		};
> 
> +		xen_report_pci_mmcfg_region(0, manage_pci.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
>  					  &manage_pci);
>  	}
> @@ -142,6 +195,7 @@ static int xen_remove_device(struct device *dev)
>  			.devfn = pci_dev->devfn
>  		};
> 
> +		xen_report_pci_mmcfg_region(device.seg, device.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove,
>  					  &device);
>  	} else if (pci_domain_nr(pci_dev->bus))
> @@ -152,6 +206,7 @@ static int xen_remove_device(struct device *dev)
>  			.devfn = pci_dev->devfn
>  		};
> 
> +		xen_report_pci_mmcfg_region(0, manage_pci.bus);
>  		r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
>  					  &manage_pci);
>  	}
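For anyone reading along without the tree handy: xen_pci_notifier() is
the PCI bus notifier that dispatches to xen_add_device() and
xen_remove_device() above, so the new xen_report_pci_mmcfg_region()
call runs on every device add/remove event. Its registration in the
same file looks roughly like this (reconstructed from memory of
drivers/xen/pci.c, so double-check the details):

	static struct notifier_block device_nb = {
		.notifier_call = xen_pci_notifier,
	};

	static int __init register_xen_pci_notifier(void)
	{
		if (!xen_initial_domain())
			return 0;

		return bus_register_notifier(&pci_bus_type, &device_nb);
	}
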
> @@ -195,49 +250,3 @@ static int __init register_xen_pci_notifier(void)
>  }
> 
>  arch_initcall(register_xen_pci_notifier);
> -
> -#ifdef CONFIG_PCI_MMCONFIG
> -static int __init xen_mcfg_late(void)
> -{
> -	struct pci_mmcfg_region *cfg;
> -	int rc;
> -
> -	if (!xen_initial_domain())
> -		return 0;
> -
> -	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
> -		return 0;
> -
> -	if (list_empty(&pci_mmcfg_list))
> -		return 0;
> -
> -	/* Check whether they are in the right area. */
> -	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
> -		struct physdev_pci_mmcfg_reserved r;
> -
> -		r.address = cfg->address;
> -		r.segment = cfg->segment;
> -		r.start_bus = cfg->start_bus;
> -		r.end_bus = cfg->end_bus;
> -		r.flags = XEN_PCI_MMCFG_RESERVED;
> -
> -		rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
> -		switch (rc) {
> -		case 0:
> -		case -ENOSYS:
> -			continue;
> -
> -		default:
> -			pr_warn("Failed to report MMCONFIG reservation"
> -				" state for %s to hypervisor"
> -				" (%d)\n",
> -				cfg->name, rc);
> -		}
> -	}
> -	return 0;
> -}
> -/*
> - * Needs to be done after acpi_init which are subsys_initcall.
> - */
> -subsys_initcall_sync(xen_mcfg_late);
> -#endif
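For reference, the initcall ordering that made the deleted approach too
late (levels as defined in include/linux/init.h; the acpi_init()
placement comes from the comment in the removed code above):

	arch_initcall()         /* register_xen_pci_notifier(), this file */
	subsys_initcall()       /* acpi_init(): MMCONFIG discovery *and*
	                           PCI enumeration, which already fires
	                           PHYSDEVOP_pci_device_add */
	subsys_initcall_sync()  /* xen_mcfg_late(): only reported the
	                           MMCONFIG areas here, after the devices
	                           above had been added */
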
> 
> 
> On Tue, Sep 22, 2015 at 8:09 AM, Jan Beulich <JBeulich@xxxxxxxx> wrote:
> 
> > >>> On 22.09.15 at 16:37, <konrad.wilk@xxxxxxxxxx> wrote:
> > > On Tue, Sep 22, 2015 at 08:11:41AM -0600, Jan Beulich wrote:
> > >> >>> On 22.09.15 at 16:03, <konrad.wilk@xxxxxxxxxx> wrote:
> > >> > On Tue, Sep 22, 2015 at 07:52:19AM -0600, Jan Beulich wrote:
> > >> >> >>> On 22.09.15 at 15:39, <konrad.wilk@xxxxxxxxxx> wrote:
> > >> >> > On Tue, Sep 22, 2015 at 06:26:11AM -0700, Ed Swierk wrote:
> > >> >> >> Any other ideas?
> > >> >> >
> > >> >> > I like it - as it will update it right away. However we would
> > >> >> > need some extra smarts in Xen to reconfigure its view of the
> > >> >> > PCI device now that the extended configuration space has become
> > >> >> > available.
> > >> >>
> > >> >> What parts are you thinking of that would need updating (and
> > >> >> aren't getting updated already)?
> > >> >
> > >> > The VF data. As before this call Xen might have not been able to
> > >> > get to the extended configuration.
> > >>
> > >> I still don't understand: Afaics pci_add_device() updates everything
> > >> that may need updating already. And things appear to be working
> > >> fine with our kernel (where the MMCFG notification comes right out
> > >> of the x86 code earlier referred to), despite this meaning updates
> > >> to the data collected during early boot. I guess you'll need to be
> > >> more specific on what you see missing...
> > >
> > > This is all ancient recollection of what I had seen a year or so ago
> > > so take it with a cup of salt.
> >
> > Urgh - a whole cup...
> >
> > > I have one of those Intel machines where the MMCFG is not marked
> > > in the E820 but is in the ACPI and I remember getting frequent
> > > WARN_ON from Xen in the msi.c code when doing passthrough on the VF.
> > > I don't have the logs but my vague recollection was that Xen could
> > > not validate the VF's MSI-X because at the time it gathered the
> > > VF PCI device information the extended configurations were not
> > > available. This was prior to the XSA120 discovery so ancient
> > > code.
> >
> > Well, VFs should not even show up prior to MMCFG getting
> > announced.
> >
> > Jan
> >
> >
