
Re: [Xen-devel] PATCH: Hugepage support for Domains booting with 4KB pages



On Sun, Mar 20, 2011 at 03:34:51PM -0700, Keshav Darak wrote:
> We have implemented hugepage support for guests in the following manner:
> 
> In our implementation we added a parameter, hugepage_num, which is
> specified in the config file of the DomU. It is the number of hugepages
> that the guest is guaranteed to receive whenever the kernel asks for a
> hugepage, either via its boot-time parameter or by reserving after boot
> (e.g. using echo XX > /proc/sys/vm/nr_hugepages). During creation of the
> domain we reserve MFNs for these hugepages and store them in a list. The

There is a boot-time option for normal Linux kernels to set that up. Was
that something you could use?
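
For reference, that is the 'hugepages=' boot parameter, e.g. booting the
guest kernel with:

	hugepages=64

pre-allocates 64 huge pages at boot (see
Documentation/vm/hugetlbpage.txt for the details).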


Regarding the patch, I have some questions:

>diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
>index bf88684..7707e21 100644
>--- a/arch/x86/include/asm/hugetlb.h
>+++ b/arch/x86/include/asm/hugetlb.h
>@@ -98,6 +98,14 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> 
>       return changed;
> }
>+#ifdef CONFIG_XEN
>+struct page* allocate_hugepage(int);
>+#else
>+static inline struct page * allocate_hugepage(int order)
>+{
>+        return NULL;
>+}
>+#endif

So it looks like you are exposing allocate_hugepage out of the Xen MMU
code to the generic hugetlb code. Could you do this via a pvops
structure instead? You should also separate this functionality out as
its own patch - i.e., exposing allocate_hugepage.
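
Roughly along these lines - the hook and wrapper names here are made
up, there is no such member today:

	/* arch/x86/include/asm/paravirt_types.h */
	struct pv_mmu_ops {
		...
		struct page *(*alloc_hugepage)(int nid, int order);
	};

	/* generic wrapper the hugetlb code would call */
	static inline struct page *arch_alloc_hugepage(int nid, int order)
	{
		return pv_mmu_ops.alloc_hugepage(nid, order);
	}

with the baremetal default doing the alloc_pages_exact_node() call and
xen_init_mmu_ops() plugging in your allocate_hugepage().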
> 
> static inline int arch_prepare_hugepage(struct page *page)
> {
>diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
>index f46c340..00c489a 100644
>--- a/arch/x86/mm/hugetlbpage.c
>+++ b/arch/x86/mm/hugetlbpage.c
>@@ -147,8 +147,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
>                       pte = (pte_t *) pmd_alloc(mm, pud, addr);
>               }
>       }
>-      BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
>-
>+      BUG_ON(pte && !pte_none(*pte) && !((*pte).pte & (_AT(pteval_t, 1)<<7)));

Ugh. That is horrible.

Why can't you use 'pte_huge'? Is it because of this check in
arch/x86/xen/mmu.c?

 * (We should never see kernel mappings with _PAGE_PSE set,
 * but we could see hugetlbfs mappings, I think.)
 */
if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {

If so, have you thought about removing the warning and/or changing
the logic there instead?
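
That way the original check could stay as it was:

	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));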

>       return pte;
> }
> 
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 070f138..c1db610 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -58,6 +58,7 @@
> #include <asm/reboot.h>
> #include <asm/stackprotector.h>
> #include <asm/hypervisor.h>
>+#include <linux/list.h>
> 
> #include "xen-ops.h"
> #include "mmu.h"
>@@ -76,6 +77,9 @@ EXPORT_SYMBOL(machine_to_phys_mapping);
> unsigned int   machine_to_phys_order;
> EXPORT_SYMBOL(machine_to_phys_order);
> 
>+struct list_head xen_hugepfn_list;
>+EXPORT_SYMBOL_GPL(xen_hugepfn_list);
>+

Hmm, a list, but no locking? There is no need for a spinlock?
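
If there is, something as simple as this (the lock name is made up)
around every access to the list would do:

	static DEFINE_SPINLOCK(xen_hugepfn_lock);

	spin_lock(&xen_hugepfn_lock);
	page = list_first_entry(&xen_hugepfn_list, struct page, lru);
	list_del(&page->lru);
	spin_unlock(&xen_hugepfn_lock);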

> struct start_info *xen_start_info;
> EXPORT_SYMBOL_GPL(xen_start_info);
> 
>diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>index 32a1c65..fa9fa6c 100644
>--- a/arch/x86/xen/mmu.c
>+++ b/arch/x86/xen/mmu.c
>@@ -44,6 +44,7 @@
> #include <linux/bug.h>
> #include <linux/vmalloc.h>
> #include <linux/module.h>
>+#include <linux/bootmem.h>
> 
> #include <asm/pgtable.h>
> #include <asm/tlbflush.h>
>@@ -2776,5 +2784,79 @@ static int __init xen_mmu_debugfs(void)
>       return 0;
> }
> fs_initcall(xen_mmu_debugfs);
>+//a2k2
>+extern struct list_head xen_hugepfn_list;
>+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
>+
>+static void scrub_page(struct page *page)
>+{
>+#ifdef CONFIG_XEN_SCRUB_PAGES
>+              clear_highpage(page);
>+#endif

You don't seem to use this anywhere.
>+}
>+#ifdef CONFIG_HIGHMEM
>+#define inc_totalhigh_pages() (totalhigh_pages++)
>+#define dec_totalhigh_pages() (totalhigh_pages--)

So what do you do with those? They seem to be incremented, but
nothing seems to use them.
>+#else
>+#define inc_totalhigh_pages() do {} while(0)
>+#define dec_totalhigh_pages() do {} while(0)
>+#endif
>+
>+struct page* allocate_hugepage(int order){
>+    unsigned long  mfn, i, j;

You seem to be defining 'i' but not using it?

>+    struct page   *page;
>+    long           rc,cnt;

You should be using 'int' for rc.

>+    unsigned long pfn;
>+    struct xen_memory_reservation reservation = {
>+      .address_bits = 0,
>+      .domid        = DOMID_SELF
>+    };
>+    if(list_empty(&xen_hugepfn_list)){
>+        return NULL; 
>+    } 
>+    page = list_entry(xen_hugepfn_list.next, struct page, lru);
>+    list_del(&page->lru);
>+    frame_list[0] = page_to_pfn(page);
>+
>+    set_xen_guest_handle(reservation.extent_start, frame_list);
>+    reservation.nr_extents = 1;
>+    reservation.extent_order = 9;
Shouldn't this be 'order' instead of '9'?
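I.e.:

	reservation.extent_order = order;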
>+    cnt=1<<order;
>+    rc = HYPERVISOR_memory_op(XENMEM_populate_hugepage, &reservation);
>+    if (rc <= 0)
>+    {
>+      printk("a2k2: could not allocate hugepage\n"); 

Please run 'scripts/checkpatch.pl' before posting. It will show that
this is not good (no KERN_ level on the printk, for one).
>+      goto out1;
>+    }
>+    pfn=page_to_pfn(page);
>+    if(rc)
>+    {
>+      mfn = frame_list[0];
>+      for (j = 0; j < cnt; j++, pfn++, mfn++) {
>+          set_phys_to_machine(pfn, mfn);
>+          if (pfn < max_low_pfn) {

... and what if 'pfn >= max_low_pfn'? What should we do then?
>+              int ret;
>+              ret = HYPERVISOR_update_va_mapping(
>+                                             (unsigned long)__va(pfn << PAGE_SHIFT),

Use the PFN_PHYS macro here, please.
>+                                             mfn_pte(mfn, PAGE_KERNEL),
>+                                             0);
>+              BUG_ON(ret);

Ugh. What if you just stopped the allocation and returned NULL instead?
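
I.e. something like:

	ret = HYPERVISOR_update_va_mapping(
			(unsigned long)__va(PFN_PHYS(pfn)),
			mfn_pte(mfn, PAGE_KERNEL), 0);
	if (ret)
		goto out1;	/* fail the allocation, don't crash */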

>+          }
>+      }
>+
>+      ClearPageReserved(page);
>+      atomic_set(&page->_count,1);

Ugh. Is that correct? Can you explain why you need to do that?

>+    }

What if rc is zero? Should we continue on with the page? (Though the
'rc <= 0' check above already goes to out1, which makes this 'if (rc)'
look redundant.)
>+    if (PageHighMem(page)) {
>+        for(i=0;i<512;i++)

Shouldn't you determine the value 512 from the 'order'?
>+          inc_totalhigh_pages();
>+    }
>+    totalram_pages+=512;

Ditto.
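
I.e.:

	totalram_pages += 1UL << order;

and the same '1UL << order' bound for the HighMem loop above.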
>+    __SetPageHead(page);
>+    set_compound_order(page,order);
>+    return page;
>+ out1:

Shouldn't you return the page back to the xen_hugepfn_list?
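
I.e.:

	 out1:
		list_add(&page->lru, &xen_hugepfn_list);
		return NULL;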

>+    return NULL;
>+}
> 
> #endif        /* CONFIG_XEN_DEBUG_FS */
>diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>index 1a1934a..c1781e7 100644
>--- a/arch/x86/xen/setup.c
>+++ b/arch/x86/xen/setup.c
>@@ -138,6 +138,40 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
>       return released;
> }
> 
>+extern struct list_head xen_hugepfn_list;
>+static int __init reserve_hugepage_range(void)
>+{
>+    phys_addr_t temp,hugemem_start;
>+    unsigned long i,ret;

Is 'unsigned long' the right type for 'ret'?

>+    struct page *page;
>+    static unsigned long frame_list[1];
>+    struct xen_memory_reservation reservation = {
>+        .address_bits = 0,
>+       .extent_order = 0,
>+       .nr_extents   = 1,
>+       .domid        = DOMID_SELF
>+    };
>+    if(!xen_pv_domain())
>+       return -ENODEV;
>+    set_xen_guest_handle(reservation.extent_start, frame_list);
>+    hugemem_start=PFN_UP(xen_extra_mem_start);
>+    ret = HYPERVISOR_memory_op(20, &reservation);

20? There is no #define for this? You are also not checking whether
the hypercall failed.
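
Something like this, please - the name below is made up, match
whatever the hypervisor side defines for op 20:

	/* include/xen/interface/memory.h */
	#define XENMEM_count_hugepages    20

	ret = HYPERVISOR_memory_op(XENMEM_count_hugepages, &reservation);
	if ((long)ret < 0)
		return ret;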


>+    printk("a2k2: num of hugepages found =%lu\n",ret);
>+    temp=PFN_PHYS(PFN_UP(xen_extra_mem_start)+ret*512);
There has to be a better way of doing this.

You are assuming that the hugepage is always 2MB. What if it is 16MB?
Is there a way you can use the architecture's definitions for this?
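
For example, using the architecture's own definitions (assuming
HPAGE_SHIFT/HPAGE_SIZE are usable this early in boot):

	unsigned long pages_per_hpage = 1UL << (HPAGE_SHIFT - PAGE_SHIFT);

	temp = PFN_PHYS(PFN_UP(xen_extra_mem_start) + ret * pages_per_hpage);
	xen_extra_mem_size -= ret * HPAGE_SIZE;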

>+
>+    xen_extra_mem_start=temp;
>+    xen_extra_mem_size-=ret*2*1024*1024;

Ditto. You are assuming it is 2MB. It could be bigger (or smaller)
depending on the architecture.

>+    INIT_LIST_HEAD(&xen_hugepfn_list);
>+    for(i=0;i<ret;i++)
>+    {
>+        page=pfn_to_page(hugemem_start);
>+        list_add_tail(&page->lru,&xen_hugepfn_list);
>+        hugemem_start+=512;
>+    }
>+    return 0;
>+}
>+subsys_initcall(reserve_hugepage_range);
>+
> /**
>  * machine_specific_memory_setup - Hook for machine specific memory setup.
>  **/
>diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>index aa4e368..168dd2b 100644
>--- a/include/xen/interface/memory.h
>+++ b/include/xen/interface/memory.h
>@@ -19,6 +19,7 @@
> #define XENMEM_increase_reservation 0
> #define XENMEM_decrease_reservation 1
> #define XENMEM_populate_physmap     6
>+#define XENMEM_populate_hugepage    19
> struct xen_memory_reservation {
> 
>     /*
>diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>index f5a106e..7e38f73 100644
>--- a/mm/hugetlb.c
>+++ b/mm/hugetlb.c
>@@ -18,6 +18,7 @@
> #include <linux/mutex.h>
> #include <linux/bootmem.h>
> #include <linux/sysfs.h>
>+#include <xen/xen.h>
> 
> #include <asm/page.h>
> #include <asm/pgtable.h>
>@@ -600,17 +620,19 @@ int PageHuge(struct page *page)
>       return dtor == free_huge_page;
> }
> 
>+ 
> static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
> {
>       struct page *page;
>-
>       if (h->order >= MAX_ORDER)
>               return NULL;
>-
>-      page = alloc_pages_exact_node(nid,
>-              htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
>-                                              __GFP_REPEAT|__GFP_NOWARN,
>-              huge_page_order(h));
>+       if(!xen_pv_domain())

Ugh. That is not the right way. You should be looking at using the pvops
struct interface so that you can override the baremetal default
implementation.
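I.e. with a hook like the one sketched earlier, mm/hugetlb.c would
stay hypervisor-agnostic:

	page = arch_alloc_hugepage(nid, huge_page_order(h));

where the baremetal implementation of the hook does the
alloc_pages_exact_node() call and the Xen one does yours.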

>+              page = alloc_pages_exact_node(nid,
>+                      htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
>+                                                      __GFP_REPEAT|__GFP_NOWARN,
>+                      huge_page_order(h));
>+      else
>+              page=allocate_hugepage(huge_page_order(h));
>       if (page) {
>               if (arch_prepare_hugepage(page)) {
>                       __free_pages(page, huge_page_order(h));
>

