
Re: [Xen-devel] PATCH: Hugepage support for Domains booting with 4KB pages



On Sun, Mar 20, 2011 at 03:34:51PM -0700, Keshav Darak wrote:
> We have implemented hugepage support for guests in the following manner:
> 
> In our implementation we added a parameter, hugepage_num, which is
> specified in the config file of the DomU. It is the number of hugepages
> that the guest is guaranteed to receive whenever the kernel asks for a
> hugepage, either via its boot-time parameter or by reserving after boot
> (e.g. using echo XX > /proc/sys/vm/nr_hugepages). During creation of the
> domain we reserve MFNs for these hugepages and store them in a list. The

There is a boot-time option for normal Linux kernels to set that up. Was
that something you could use?
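
For reference, that is the 'hugepages=' boot parameter, e.g. booting the
guest kernel with:

	hugepages=64

pre-allocates 64 huge pages at boot (see
Documentation/vm/hugetlbpage.txt for the details).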


Regarding the patch, I have some questions:

>diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
>index bf88684..7707e21 100644
>--- a/arch/x86/include/asm/hugetlb.h
>+++ b/arch/x86/include/asm/hugetlb.h
>@@ -98,6 +98,14 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> 
>       return changed;
> }
>+#ifdef CONFIG_XEN
>+struct page* allocate_hugepage(int);
>+#else
>+static inline struct page * allocate_hugepage(int order)
>+{
>+        return NULL;
>+}
>+#endif

So it looks like you are exposing allocate_hugepage out of the Xen MMU
code to the generic hugetlb code. Could you do this via a pvops
structure instead? You should also separate this functionality out as
its own patch - i.e., exposing allocate_hugepage.
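
Roughly along these lines - the hook and wrapper names here are made
up, there is no such member today:

	/* arch/x86/include/asm/paravirt_types.h */
	struct pv_mmu_ops {
		...
		struct page *(*alloc_hugepage)(int nid, int order);
	};

	/* generic wrapper the hugetlb code would call */
	static inline struct page *arch_alloc_hugepage(int nid, int order)
	{
		return pv_mmu_ops.alloc_hugepage(nid, order);
	}

with the baremetal default doing the alloc_pages_exact_node() call and
xen_init_mmu_ops() plugging in your allocate_hugepage().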
> 
> static inline int arch_prepare_hugepage(struct page *page)
> {
>diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
>index f46c340..00c489a 100644
>--- a/arch/x86/mm/hugetlbpage.c
>+++ b/arch/x86/mm/hugetlbpage.c
>@@ -147,8 +147,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
>                       pte = (pte_t *) pmd_alloc(mm, pud, addr);
>               }
>       }
>-      BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
>-
>+      BUG_ON(pte && !pte_none(*pte) && !((*pte).pte & (_AT(pteval_t, 1)<<7)));

Ugh. That is horrible.

Why can't you use 'pte_huge'? Is it because of this check in
arch/x86/xen/mmu.c?

 * (We should never see kernel mappings with _PAGE_PSE set,
 * but we could see hugetlbfs mappings, I think.)
 */
if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {

If so, have you thought about removing the warning and/or changing
the logic there instead?
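
That way the original check could stay as it was:

	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));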

>       return pte;
> }
> 
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 070f138..c1db610 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -58,6 +58,7 @@
> #include <asm/reboot.h>
> #include <asm/stackprotector.h>
> #include <asm/hypervisor.h>
>+#include <linux/list.h>
> 
> #include "xen-ops.h"
> #include "mmu.h"
>@@ -76,6 +77,9 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
> unsigned int   machine_to_phys_order;
> EXPORT_SYMBOL(machine_to_phys_order);
> 
>+struct list_head xen_hugepfn_list;
>+EXPORT_SYMBOL_GPL(xen_hugepfn_list);
>+

Hmm, a list, but no locking? There is no need for a spinlock?
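
If there is, something as simple as this (the lock name is made up)
around every access to the list would do:

	static DEFINE_SPINLOCK(xen_hugepfn_lock);

	spin_lock(&xen_hugepfn_lock);
	page = list_first_entry(&xen_hugepfn_list, struct page, lru);
	list_del(&page->lru);
	spin_unlock(&xen_hugepfn_lock);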

> struct start_info *xen_start_info;
> EXPORT_SYMBOL_GPL(xen_start_info);
> 
>diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>index 32a1c65..fa9fa6c 100644
>--- a/arch/x86/xen/mmu.c
>+++ b/arch/x86/xen/mmu.c
>@@ -44,6 +44,7 @@
> #include <linux/bug.h>
> #include <linux/vmalloc.h>
> #include <linux/module.h>
>+#include <linux/bootmem.h>
> 
> #include <asm/pgtable.h>
> #include <asm/tlbflush.h>
>@@ -2776,5 +2784,79 @@ static int __init xen_mmu_debugfs(void)
>       return 0;
> }
> fs_initcall(xen_mmu_debugfs);
>+//a2k2
>+extern struct list_head xen_hugepfn_list;
>+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
>+
>+static void scrub_page(struct page *page)
>+{
>+#ifdef CONFIG_XEN_SCRUB_PAGES
>+              clear_highpage(page);
>+#endif

You don't seem to use this anywhere.
>+}
>+#ifdef CONFIG_HIGHMEM
>+#define inc_totalhigh_pages() (totalhigh_pages++)
>+#define dec_totalhigh_pages() (totalhigh_pages--)

So what do you do with those? They seem to be incremented, but
nothing seems to use them.
>+#else
>+#define inc_totalhigh_pages() do {} while(0)
>+#define dec_totalhigh_pages() do {} while(0)
>+#endif
>+
>+struct page* allocate_hugepage(int order){
>+    unsigned long  mfn, i, j;

You seem to be defining 'i' but not using it?

>+    struct page   *page;
>+    long           rc,cnt;

You should be using 'int' for rc.

>+    unsigned long pfn;
>+    struct xen_memory_reservation reservation = {
>+      .address_bits = 0,
>+      .domid        = DOMID_SELF
>+    };
>+    if(list_empty(&xen_hugepfn_list)){
>+        return NULL; 
>+    } 
>+    page = list_entry(xen_hugepfn_list.next, struct page, lru);
>+    list_del(&page->lru);
>+    frame_list[0] = page_to_pfn(page);
>+
>+    set_xen_guest_handle(reservation.extent_start, frame_list);
>+    reservation.nr_extents = 1;
>+    reservation.extent_order = 9;
Shouldn't this be 'order' instead of '9'?
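I.e.:

	reservation.extent_order = order;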
>+    cnt=1<<order;
>+    rc = HYPERVISOR_memory_op(XENMEM_populate_hugepage, &reservation);
>+    if (rc <= 0)
>+    {
>+      printk("a2k2: could not allocate hugepage\n"); 

Please run 'scripts/checkpatch.pl' before posting. It will show that
this is not good (no KERN_ level on the printk, for one).
>+      goto out1;
>+    }
>+    pfn=page_to_pfn(page);
>+    if(rc)
>+    {
>+      mfn = frame_list[0];
>+      for (j = 0; j < cnt; j++, pfn++, mfn++) {
>+          set_phys_to_machine(pfn, mfn);
>+          if (pfn < max_low_pfn) {

... and what if 'pfn >= max_low_pfn'? What should we do then?
>+              int ret;
>+              ret = HYPERVISOR_update_va_mapping(
>+                                             (unsigned long)__va(pfn << PAGE_SHIFT),

Use the PFN_PHYS macro here, please.
>+                                             mfn_pte(mfn, PAGE_KERNEL),
>+                                             0);
>+              BUG_ON(ret);

Ugh. What if you just stopped the allocation and returned NULL instead?
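
I.e. something like:

	ret = HYPERVISOR_update_va_mapping(
			(unsigned long)__va(PFN_PHYS(pfn)),
			mfn_pte(mfn, PAGE_KERNEL), 0);
	if (ret)
		goto out1;	/* fail the allocation, don't crash */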

>+          }
>+      }
>+
>+      ClearPageReserved(page);
>+      atomic_set(&page->_count,1);

Ugh. Is that correct? Can you explain why you need to do that?

>+    }

What if rc is zero? Should we continue on with the page? (Though the
'rc <= 0' check above already goes to out1, which makes this 'if (rc)'
look redundant.)
>+    if (PageHighMem(page)) {
>+        for(i=0;i<512;i++)

Shouldn't you determine the value 512 from the 'order'?
>+          inc_totalhigh_pages();
>+    }
>+    totalram_pages+=512;

Ditto.
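
I.e.:

	totalram_pages += 1UL << order;

and the same '1UL << order' bound for the HighMem loop above.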
>+    __SetPageHead(page);
>+    set_compound_order(page,order);
>+    return page;
>+ out1:

Shouldn't you return the page back to the xen_hugepfn_list?
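
I.e.:

	 out1:
		list_add(&page->lru, &xen_hugepfn_list);
		return NULL;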

>+    return NULL;
>+}
> 
> #endif        /* CONFIG_XEN_DEBUG_FS */
>diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>index 1a1934a..c1781e7 100644
>--- a/arch/x86/xen/setup.c
>+++ b/arch/x86/xen/setup.c
>@@ -138,6 +138,40 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
>       return released;
> }
> 
>+extern struct list_head xen_hugepfn_list;
>+static int __init reserve_hugepage_range(void)
>+{
>+    phys_addr_t temp,hugemem_start;
>+    unsigned long i,ret;

Is 'unsigned long' the right type for 'ret'?

>+    struct page *page;
>+    static unsigned long frame_list[1];
>+    struct xen_memory_reservation reservation = {
>+        .address_bits = 0,
>+       .extent_order = 0,
>+       .nr_extents   = 1,
>+       .domid        = DOMID_SELF
>+    };
>+    if(!xen_pv_domain())
>+       return -ENODEV;
>+    set_xen_guest_handle(reservation.extent_start, frame_list);
>+    hugemem_start=PFN_UP(xen_extra_mem_start);
>+    ret = HYPERVISOR_memory_op(20, &reservation);

20? There is no #define for this? You are also not checking whether
the hypercall failed.
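
Something like this, please - the name below is made up, match
whatever the hypervisor side defines for op 20:

	/* include/xen/interface/memory.h */
	#define XENMEM_count_hugepages    20

	ret = HYPERVISOR_memory_op(XENMEM_count_hugepages, &reservation);
	if ((long)ret < 0)
		return ret;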


>+    printk("a2k2: num of hugepages found =%lu\n",ret);
>+    temp=PFN_PHYS(PFN_UP(xen_extra_mem_start)+ret*512);
There has to be a better way of doing this.

You are assuming that the hugepage is always 2MB. What if it is 16MB?
Is there a way you can use the architecture's definitions for this?
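
For example, using the architecture's own definitions (assuming
HPAGE_SHIFT/HPAGE_SIZE are usable this early in boot):

	unsigned long pages_per_hpage = 1UL << (HPAGE_SHIFT - PAGE_SHIFT);

	temp = PFN_PHYS(PFN_UP(xen_extra_mem_start) + ret * pages_per_hpage);
	xen_extra_mem_size -= ret * HPAGE_SIZE;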

>+
>+    xen_extra_mem_start=temp;
>+    xen_extra_mem_size-=ret*2*1024*1024;

Ditto. You are assuming it is 2MB. It could be bigger (or smaller)
depending on the architecture.

>+    INIT_LIST_HEAD(&xen_hugepfn_list);
>+    for(i=0;i<ret;i++)
>+    {
>+        page=pfn_to_page(hugemem_start);
>+        list_add_tail(&page->lru,&xen_hugepfn_list);
>+        hugemem_start+=512;
>+    }
>+    return 0;
>+}
>+subsys_initcall(reserve_hugepage_range);
>+
> /**
>  * machine_specific_memory_setup - Hook for machine specific memory setup.
>  **/
>diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>index aa4e368..168dd2b 100644
>--- a/include/xen/interface/memory.h
>+++ b/include/xen/interface/memory.h
>@@ -19,6 +19,7 @@
> #define XENMEM_increase_reservation 0
> #define XENMEM_decrease_reservation 1
> #define XENMEM_populate_physmap     6
>+#define XENMEM_populate_hugepage    19
> struct xen_memory_reservation {
> 
>     /*
>diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>index f5a106e..7e38f73 100644
>--- a/mm/hugetlb.c
>+++ b/mm/hugetlb.c
>@@ -18,6 +18,7 @@
> #include <linux/mutex.h>
> #include <linux/bootmem.h>
> #include <linux/sysfs.h>
>+#include <xen/xen.h>
> 
> #include <asm/page.h>
> #include <asm/pgtable.h>
>@@ -600,17 +620,19 @@ int PageHuge(struct page *page)
>       return dtor == free_huge_page;
> }
> 
>+ 
> static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> {
>       struct page *page;
>-
>       if (h->order >= MAX_ORDER)
>               return NULL;
>-
>-      page = alloc_pages_exact_node(nid,
>-              htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
>-                                              __GFP_REPEAT|__GFP_NOWARN,
>-              huge_page_order(h));
>+       if(!xen_pv_domain())

Ugh. That is not the right way. You should be looking at using the pvops
struct interface so that you can override the baremetal default
implementation.
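I.e. with a hook like the one sketched earlier, mm/hugetlb.c would
stay hypervisor-agnostic:

	page = arch_alloc_hugepage(nid, huge_page_order(h));

where the baremetal implementation of the hook does the
alloc_pages_exact_node() call and the Xen one does yours.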

>+              page = alloc_pages_exact_node(nid,
>+                      htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
>+                                                      __GFP_REPEAT|__GFP_NOWARN,
>+                      huge_page_order(h));
>+      else
>+              page=allocate_hugepage(huge_page_order(h));
>       if (page) {
>               if (arch_prepare_hugepage(page)) {
>                       __free_pages(page, huge_page_order(h));
>

