[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] Re: [PATCH 07/11] Xen/x86/PCI: Add support for the Xen PCI subsystem



On Thu, May 07, 2009 at 02:45:33PM -0700, Jeremy Fitzhardinge wrote:
> From: Alex Nixon <alex.nixon@xxxxxxxxxx>
> 
> Impact: add core of Xen PCI support
> 
> On boot, the system will search to see if a Xen iommu/pci subsystem is
> available.  If the kernel detects it's running in a domain rather than
> on bare hardware, this subsystem will be used.  Otherwise, it falls
> back to using hardware as usual.
> 
> The frontend stub lives in arch/x86/pci-xen.c, alongside other
> sub-arch PCI init code (e.g. olpc.c)
> 
> (All subsequent fixes, API changes and swiotlb operations folded in.)
> 
> Signed-off-by: Alex Nixon <alex.nixon@xxxxxxxxxx>
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@xxxxxxxxxx>
> Signed-off-by: Ian Campbell <ian.campbell@xxxxxxxxxx>
> Reviewed-by: "H. Peter Anvin" <hpa@xxxxxxxxx>
> ---
>  arch/x86/Kconfig                 |    4 +
>  arch/x86/include/asm/pci_x86.h   |    1 +
>  arch/x86/include/asm/xen/iommu.h |   12 ++
>  arch/x86/kernel/pci-dma.c        |    3 +
>  arch/x86/pci/Makefile            |    1 +
>  arch/x86/pci/init.c              |    6 +
>  arch/x86/pci/xen.c               |   52 +++++++
>  drivers/pci/Makefile             |    2 +
>  drivers/pci/xen-iommu.c          |  302 
> ++++++++++++++++++++++++++++++++++++++
>  9 files changed, 383 insertions(+), 0 deletions(-)
>  create mode 100644 arch/x86/include/asm/xen/iommu.h
>  create mode 100644 arch/x86/pci/xen.c
>  create mode 100644 drivers/pci/xen-iommu.c
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 4b34082..4a62659 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1830,6 +1830,10 @@ config PCI_OLPC
>       def_bool y
>       depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
>  
> +config PCI_XEN
> +     def_bool y
> +     depends on XEN_PCI_PASSTHROUGH || XEN_DOM0_PCI
> +
>  config PCI_DOMAINS
>       def_bool y
>       depends on PCI
> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
> index 5401ca2..34f03a4 100644
> --- a/arch/x86/include/asm/pci_x86.h
> +++ b/arch/x86/include/asm/pci_x86.h
> @@ -107,6 +107,7 @@ extern int pci_direct_probe(void);
>  extern void pci_direct_init(int type);
>  extern void pci_pcbios_init(void);
>  extern int pci_olpc_init(void);
> +extern int pci_xen_init(void);
>  extern void __init dmi_check_pciprobe(void);
>  extern void __init dmi_check_skip_isa_align(void);
>  
> diff --git a/arch/x86/include/asm/xen/iommu.h 
> b/arch/x86/include/asm/xen/iommu.h
> new file mode 100644
> index 0000000..75df312
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/iommu.h
> @@ -0,0 +1,12 @@
> +#ifndef ASM_X86__XEN_IOMMU_H
> +
> +#ifdef CONFIG_PCI_XEN
> +extern void xen_iommu_init(void);
> +#else
> +static inline void xen_iommu_init(void)
> +{
> +}
> +#endif
> +
> +#endif
> +
> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
> index 745579b..2fffc22 100644
> --- a/arch/x86/kernel/pci-dma.c
> +++ b/arch/x86/kernel/pci-dma.c
> @@ -10,6 +10,7 @@
>  #include <asm/gart.h>
>  #include <asm/calgary.h>
>  #include <asm/amd_iommu.h>
> +#include <asm/xen/iommu.h>
>  
>  static int forbid_dac __read_mostly;
>  
> @@ -275,6 +276,8 @@ static int __init pci_iommu_init(void)
>       dma_debug_add_bus(&pci_bus_type);
>  #endif
>  
> +     xen_iommu_init();
> +     
>       calgary_iommu_init();
>  
>       intel_iommu_init();
> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
> index d49202e..64182c5 100644
> --- a/arch/x86/pci/Makefile
> +++ b/arch/x86/pci/Makefile
> @@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS)                += pcbios.o
>  obj-$(CONFIG_PCI_MMCONFIG)   += mmconfig_$(BITS).o direct.o mmconfig-shared.o
>  obj-$(CONFIG_PCI_DIRECT)     += direct.o
>  obj-$(CONFIG_PCI_OLPC)               += olpc.o
> +obj-$(CONFIG_PCI_XEN)                += xen.o
>  
>  obj-y                                += fixup.o
>  obj-$(CONFIG_ACPI)           += acpi.o
> diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
> index 25a1f8e..4e2f90a 100644
> --- a/arch/x86/pci/init.c
> +++ b/arch/x86/pci/init.c
> @@ -15,10 +15,16 @@ static __init int pci_arch_init(void)
>       if (!(pci_probe & PCI_PROBE_NOEARLY))
>               pci_mmcfg_early_init();
>  
> +#ifdef CONFIG_PCI_XEN
> +     if (!pci_xen_init())
> +             return 0;
> +#endif
> +
>  #ifdef CONFIG_PCI_OLPC
>       if (!pci_olpc_init())
>               return 0;       /* skip additional checks if it's an XO */
>  #endif
> +
>  #ifdef CONFIG_PCI_BIOS
>       pci_pcbios_init();
>  #endif
> diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
> new file mode 100644
> index 0000000..76f803f
> --- /dev/null
> +++ b/arch/x86/pci/xen.c
> @@ -0,0 +1,52 @@
> +/*
> + * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
> + *                      x86 PCI core to support the Xen PCI Frontend
> + *
> + *   Author: Ryan Wilson <hap9@xxxxxxxxxxxxxx>
> + */
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/pci.h>
> +#include <linux/acpi.h>
> +
> +#include <asm/pci_x86.h>
> +
> +#include <asm/xen/hypervisor.h>
> +
> +static int xen_pcifront_enable_irq(struct pci_dev *dev)
> +{
> +     return 0;
> +}
> +
> +extern int isapnp_disable;
> +
> +int __init pci_xen_init(void)
> +{
> +     if (!xen_pv_domain() || xen_initial_domain())
> +             return -ENODEV;
> +
> +     printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
> +
> +     pcibios_set_cache_line_size();
> +
> +     pcibios_enable_irq = xen_pcifront_enable_irq;
> +     pcibios_disable_irq = NULL;
> +
> +#ifdef CONFIG_ACPI
> +     /* Keep ACPI out of the picture */
> +     acpi_noirq = 1;
> +#endif
> +
> +#ifdef CONFIG_ISAPNP
> +     /* Stop isapnp from probing */
> +     isapnp_disable = 1;
> +#endif
> +
> +     /* Ensure a device still gets scanned even if it's fn number
> +      * is non-zero.
> +      */
> +     pci_scan_all_fns = 1;
> +
> +     return 0;
> +}
> +
> diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
> index ba6af16..8db0cb5 100644
> --- a/drivers/pci/Makefile
> +++ b/drivers/pci/Makefile
> @@ -27,6 +27,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
>  # Build Intel IOMMU support
>  obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
>  
> +# Build Xen IOMMU support
> +obj-$(CONFIG_PCI_XEN) += xen-iommu.o
>  obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
>  
>  obj-$(CONFIG_PCI_IOV) += iov.o
> diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
> new file mode 100644
> index 0000000..e7a22f1
> --- /dev/null
> +++ b/drivers/pci/xen-iommu.c
> @@ -0,0 +1,302 @@
> +#include <linux/types.h>
> +#include <linux/mm.h>
> +#include <linux/string.h>
> +#include <linux/pci.h>
> +#include <linux/module.h>
> +#include <linux/version.h>
> +#include <linux/scatterlist.h>
> +#include <linux/io.h>
> +#include <linux/bug.h>
> +
> +#include <xen/interface/xen.h>
> +#include <xen/grant_table.h>
> +#include <xen/page.h>
> +#include <xen/xen-ops.h>
> +
> +#include <asm/iommu.h>
> +#include <asm/swiotlb.h>
> +#include <asm/tlbflush.h>
> +
> +#define IOMMU_BUG_ON(test)                           \
> +do {                                                 \
> +     if (unlikely(test)) {                           \
> +             printk(KERN_ALERT "Fatal DMA error! "   \
> +                    "Please use 'swiotlb=force'\n"); \
> +             BUG();                                  \
> +     }                                               \
> +} while (0)
> +
> +/* Print address range with message */
> +#define PAR(msg, addr, size)                                 \
> +do {                                                 \
> +     printk(msg "[%#llx - %#llx]\n",                 \
> +     (unsigned long long)addr,                       \
> +     (unsigned long long)addr + size);               \
> +} while (0)
> +
> +struct dma_coherent_mem {
> +     void            *virt_base;
> +     u32             device_base;
> +     int             size;
> +     int             flags;
> +     unsigned long   *bitmap;
> +};
> +
> +static inline int address_needs_mapping(struct device *hwdev,
> +                                             dma_addr_t addr)
> +{
> +     dma_addr_t mask = 0xffffffff;

You can use DMA_32BIT_MASK here.

> +     int ret;
> +
> +     /* If the device has a mask, use it, otherwise default to 32 bits */
> +     if (hwdev && hwdev->dma_mask)
> +             mask = *hwdev->dma_mask;

I think the check for a valid hwdev->dma_mask is not necessary. Other
IOMMU drivers also don't check for this.

> +
> +     ret = (addr & ~mask) != 0;
> +
> +     if (ret) {
> +             printk(KERN_ERR "dma address needs mapping\n");
> +             printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
> +     }
> +     return ret;
> +}
> +
> +static int check_pages_physically_contiguous(unsigned long pfn,
> +                                          unsigned int offset,
> +                                          size_t length)
> +{
> +     unsigned long next_mfn;
> +     int i;
> +     int nr_pages;
> +
> +     next_mfn = pfn_to_mfn(pfn);
> +     nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
> +
> +     for (i = 1; i < nr_pages; i++) {
> +             if (pfn_to_mfn(++pfn) != ++next_mfn)
> +                     return 0;
> +     }
> +     return 1;
> +}
> +
> +static int range_straddles_page_boundary(phys_addr_t p, size_t size)
> +{
> +     unsigned long pfn = PFN_DOWN(p);
> +     unsigned int offset = p & ~PAGE_MASK;
> +
> +     if (offset + size <= PAGE_SIZE)
> +             return 0;

You can use iommu_num_pages here from lib/iommu-helpers.c

> +     if (check_pages_physically_contiguous(pfn, offset, size))
> +             return 0;
> +     return 1;
> +}
> +
> +static inline void xen_dma_unmap_page(struct page *page)
> +{
> +     /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
> +      * to deal with foreign pages.  We'll need similar logic here at
> +      * some point.
> +      */
> +}
> +
> +/* Gets dma address of a page */
> +static inline dma_addr_t xen_dma_map_page(struct page *page)
> +{
> +     /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
> +      * with foreign pages.  We'll need similar logic here at some
> +      * point.
> +      */
> +     return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
> +}
> +
> +static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
> +                     int nents, int direction)
> +{
> +     struct scatterlist *s;
> +     struct page *page;
> +     int i, rc;
> +
> +     BUG_ON(direction == DMA_NONE);
> +     WARN_ON(nents == 0 || sg[0].length == 0);
> +
> +     for_each_sg(sg, s, nents, i) {
> +             BUG_ON(!sg_page(s));
> +             page = sg_page(s);
> +             s->dma_address = xen_dma_map_page(page) + s->offset;
> +             s->dma_length = s->length;
> +             IOMMU_BUG_ON(range_straddles_page_boundary(
> +                             page_to_phys(page), s->length));

I have a question on this. How do you make sure that the memory to map
does not cross page boundaries? I have a stats counter for x-page
requests in the AMD IOMMU code and around 10% of the requests are actually
x-page for me.

> +     }
> +
> +     rc = nents;
> +
> +     flush_write_buffers();
> +     return rc;
> +}
> +
> +static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
> +                      int nents, int direction)
> +{
> +     struct scatterlist *s;
> +     struct page *page;
> +     int i;
> +
> +     for_each_sg(sg, s, nents, i) {
> +             page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
> +             xen_dma_unmap_page(page);
> +     }
> +}
> +
> +static void *xen_alloc_coherent(struct device *dev, size_t size,
> +                             dma_addr_t *dma_handle, gfp_t gfp)
> +{
> +     void *ret;
> +     struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
> +     unsigned int order = get_order(size);
> +     unsigned long vstart;
> +     u64 mask;
> +
> +     /* ignore region specifiers */
> +     gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
> +
> +     if (mem) {
> +             int page = bitmap_find_free_region(mem->bitmap, mem->size,
> +                                                  order);

Can you use iommu_area_alloc here?

> +             if (page >= 0) {
> +                     *dma_handle = mem->device_base + (page << PAGE_SHIFT);
> +                     ret = mem->virt_base + (page << PAGE_SHIFT);
> +                     memset(ret, 0, size);
> +                     return ret;
> +             }
> +             if (mem->flags & DMA_MEMORY_EXCLUSIVE)
> +                     return NULL;
> +     }
> +
> +     if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))

DMA_32BIT_MASK

> +             gfp |= GFP_DMA;
> +
> +     vstart = __get_free_pages(gfp, order);
> +     ret = (void *)vstart;
> +
> +     if (dev != NULL && dev->coherent_dma_mask)
> +             mask = dev->coherent_dma_mask;
> +     else
> +             mask = 0xffffffff;

DMA_32BIT_MASK

> +
> +     if (ret != NULL) {
> +             if (xen_create_contiguous_region(vstart, order,
> +                                              fls64(mask)) != 0) {
> +                     free_pages(vstart, order);
> +                     return NULL;
> +             }
> +             memset(ret, 0, size);
> +             *dma_handle = virt_to_machine(ret).maddr;
> +     }
> +     return ret;
> +}
> +
> +static void xen_free_coherent(struct device *dev, size_t size,
> +                      void *vaddr, dma_addr_t dma_addr)
> +{
> +     struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
> +     int order = get_order(size);
> +
> +     if (mem && vaddr >= mem->virt_base &&
> +         vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
> +             int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
> +             bitmap_release_region(mem->bitmap, page, order);

iommu_area_free

> +     } else {
> +             xen_destroy_contiguous_region((unsigned long)vaddr, order);
> +             free_pages((unsigned long)vaddr, order);
> +     }
> +}
> +
> +static dma_addr_t xen_swiotlb_map_single(struct device *dev, phys_addr_t 
> paddr,
> +                                      size_t size, int direction)
> +{
> +     dma_addr_t dma;
> +     BUG_ON(direction == DMA_NONE);
> +
> +     WARN_ON(size == 0);
> +     dma = swiotlb_map_single(dev, phys_to_virt(paddr), size, direction);
> +
> +     flush_write_buffers();
> +     return dma;
> +}
> +
> +static dma_addr_t xen_map_single(struct device *dev, phys_addr_t paddr,
> +                                     size_t size, int direction)
> +{
> +     struct page *page;
> +     dma_addr_t dma;
> +
> +     BUG_ON(direction == DMA_NONE);
> +
> +     WARN_ON(size == 0);
> +     page = pfn_to_page(PFN_DOWN(paddr));
> +
> +     dma = xen_dma_map_page(page) + offset_in_page(paddr);
> +
> +     IOMMU_BUG_ON(address_needs_mapping(dev, dma));
> +     IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size));
> +     flush_write_buffers();
> +     return dma;
> +}
> +
> +static void xen_unmap_single(struct device *dev, dma_addr_t dma_addr,
> +                             size_t size, int direction)
> +{
> +     BUG_ON(direction == DMA_NONE);
> +     xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
> +}
> +
> +static struct dma_mapping_ops xen_dma_ops = {
> +     .dma_supported = NULL,
> +
> +     .alloc_coherent = xen_alloc_coherent,
> +     .free_coherent = xen_free_coherent,
> +
> +     .map_single = xen_map_single,
> +     .unmap_single = xen_unmap_single,
> +
> +     .map_sg = xen_map_sg,
> +     .unmap_sg = xen_unmap_sg,
> +
> +     .mapping_error = NULL,
> +
> +     .is_phys = 0,
> +};
> +
> +static struct dma_mapping_ops xen_swiotlb_dma_ops = {
> +     .dma_supported = swiotlb_dma_supported,
> +
> +     .alloc_coherent = xen_alloc_coherent,
> +     .free_coherent = xen_free_coherent,
> +
> +     .map_single = xen_swiotlb_map_single,   /* swiotlb_map_single has a 
> different prototype */
> +     .unmap_single = swiotlb_unmap_single,
> +
> +     .map_sg = swiotlb_map_sg,
> +     .unmap_sg = swiotlb_unmap_sg,
> +
> +     .mapping_error = swiotlb_dma_mapping_error,
> +
> +     .is_phys = 0,
> +};
> +
> +void __init xen_iommu_init(void)
> +{
> +     if (!xen_pv_domain())
> +             return;
> +
> +     printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
> +
> +     force_iommu = 0;
> +     dma_ops = &xen_dma_ops;
> +
> +     if (swiotlb) {
> +             printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
> +             dma_ops = &xen_swiotlb_dma_ops;
> +     }
> +}
> +
> -- 
> 1.6.0.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.