[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC Patch] Support for making an E820 PCI hole in toolstack (xl + xm)



On Mon, Nov 15, 2010 at 07:57:47PM +0000, Keir Fraser wrote:
> On 15/11/2010 19:32, "Jeremy Fitzhardinge" <jeremy@xxxxxxxx> wrote:
> 
> >> Or, is there much disadvantage, to having a static really big PCI hole? Say
> >> starting at 1GB? The advantage of this would be the ability to hotplug PCI
> >> devices to a domU even across save/restore/migrate -- this may not work so
> >> well if you commit yourself to the hole size of the original host, and the
> >> restore/migrate target host has a bigger hole!
> > 
> > Well, the other question is whether the devices have to have the same
> > pfn as mfn within the hole.  We're emulating the PCI config space anyway
> > - couldn't we stick the passthrough PCI space at 3G regardless of where
> > it is on the real hardware?

Your thinking is that on the Linux side, any of the pfns that are within
those System RAM gaps (irregardless if they are above or below 4GB) would
be consultated during PTE creation/lookup (xen_pte_val..). 

And if those PFNs are within those System RAM gaps, we would store the 
MFN in the P2M list and instead of doing:
   val = ((pteval_t)pfn << PAGE_SHIFT) | flags

we would actually do mfn = pfn_to_mfn(pfn) and stick on the _PAGE_IOMAP flag.

And  example patch (compiled tested, not tested any other way) attached at the
end of this email.

How does that work on the Xen side? Does the hypervisor depend on the pages
that belong to the DOM_IO domain to have a INVALID_MFN value in the mfn_list?

We do make the PTE that refer to physical devices to be the DOM_IO domain..


> 
> Well, I don't know. It sounds pretty sensible to me. :-)
> 
> Certain virtualisation feature sdisappearing after a save/restore/migrate --
> or worsse, becoming unreliable -- would be a bit sad.

So having the option of the PCI hole being passed through, and giving
the tools the value (pci_hole) would mean we could migrate an SR-IOV type
device from one machine to another. Constructing the PCI hole using the
XENMEM_machine_memory_map could generate different E820 for the two guests, 
which
would be indeed a bit sad.


--- the patch ---

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 50dc626..96a08ef 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -699,7 +699,7 @@ static bool xen_page_pinned(void *ptr)
 
 static bool xen_iomap_pte(pte_t pte)
 {
-       return pte_flags(pte) & _PAGE_IOMAP;
+       return xen_pfn_is_pci(pte_mfn(pte));
 }
 
 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
@@ -801,11 +801,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, 
pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
 {
-       if (xen_iomap_pte(pteval)) {
-               xen_set_iomap_pte(ptep, pteval);
-               goto out;
-       }
-
        ADD_STATS(set_pte_at, 1);
 //     ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
        ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -889,19 +884,6 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
        return val;
 }
 
-static pteval_t iomap_pte(pteval_t val)
-{
-       if (val & _PAGE_PRESENT) {
-               unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
-               pteval_t flags = val & PTE_FLAGS_MASK;
-
-               /* We assume the pte frame number is a MFN, so
-                  just use it as-is. */
-               val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
-       }
-
-       return val;
-}
 
 pteval_t xen_pte_val(pte_t pte)
 {
@@ -913,8 +895,8 @@ pteval_t xen_pte_val(pte_t pte)
                pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
        }
 
-       if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
-               return pteval;
+       if (xen_pfn_is_pci(pte_mfn(pte)))
+               pteval |= _PAGE_IOMAP;
 
        return pte_mfn_to_pfn(pteval);
 }
@@ -974,13 +956,14 @@ pte_t xen_make_pte(pteval_t pte)
         * mappings are just dummy local mappings to keep other
         * parts of the kernel happy.
         */
-       if (unlikely(pte & _PAGE_IOMAP) &&
-           (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-               pte = iomap_pte(pte);
-       } else {
+       if ((unlikely(pte & _PAGE_IOMAP) &&
+           (xen_initial_domain() || addr >= ISA_END_ADDRESS)) ||
+           (unlikely(xen_pfn_is_pci(PFN_UP(addr)))))
+               pte |=  _PAGE_IOMAP;
+       else
                pte &= ~_PAGE_IOMAP;
-               pte = pte_pfn_to_mfn(pte);
-       }
+
+       pte = pte_pfn_to_mfn(pte);
 
        return native_make_pte(pte);
 }
@@ -1037,10 +1020,8 @@ void xen_set_pud(pud_t *ptr, pud_t val)
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
-       if (xen_iomap_pte(pte)) {
+       if (xen_iomap_pte(pte))
                xen_set_iomap_pte(ptep, pte);
-               return;
-       }
 
        ADD_STATS(pte_update, 1);
 //     ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
@@ -1058,10 +1039,8 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
 #ifdef CONFIG_X86_PAE
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
-       if (xen_iomap_pte(pte)) {
+       if (xen_iomap_pte(pte))
                xen_set_iomap_pte(ptep, pte);
-               return;
-       }
 
        set_64bit((u64 *)ptep, native_pte_val(pte));
 }
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 5a1f22d..bb424e3 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -196,6 +196,31 @@ unsigned long xen_find_max_pfn(void)
        xen_raw_printk("E820 max_pfn = %ld (nr_pages: %ld)\n", max_pfn, 
xen_start_info->nr_pages);
        return max_pfn;
 }
+
+int xen_pfn_is_pci(unsigned long pfn)
+{
+       static struct e820entry map[E820MAX] __initdata;
+       int rc, op, i;
+       struct xen_memory_map memmap;
+       unsigned long long addr = PFN_PHYS(pfn);
+       memmap.nr_entries = E820MAX;
+       set_xen_guest_handle(memmap.buffer, map);
+
+       op = xen_initial_domain() ?
+               XENMEM_machine_memory_map :
+               XENMEM_memory_map;
+       rc = HYPERVISOR_memory_op(op, &memmap);
+       BUG_ON(rc);
+
+       for (i = 0; i < memmap.nr_entries; i++) {
+               unsigned long long end = map[i].addr + map[i].size;
+               if (map[i].type != E820_RAM)
+                       continue;
+               if (addr >= map[i].addr && addr <= end)
+                       return 0;
+       }
+       return 1;
+}
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
  **/
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index eee2045..f859b04 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -31,4 +31,6 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 extern phys_addr_t xen_extra_mem_start;
 unsigned long xen_find_max_pfn(void);
 
+int xen_pfn_is_pci(unsigned long pfn);
+
 #endif /* INCLUDE_XEN_OPS_H */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.