[Minios-devel] [PATCH v2 12/22] mini-os: add x86 native page table handling



For HVMlite support, don't use mmu_update hypercalls; instead, write
the page table entries directly.

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
Reviewed-by: Samuel Thibault <samuel.thibault@xxxxxxxxxxxx>
---
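A minimal sketch of the two update models involved (illustrative only;
table_mfn, mfn, offset, prot and va are placeholder names, not code
from this patch):

    /* PV: page tables are read-only to the guest; ask Xen to update. */
    mmu_update_t req;
    req.ptr = ((pgentry_t)table_mfn << PAGE_SHIFT) +
              sizeof(pgentry_t) * offset;
    req.val = ((pgentry_t)mfn << PAGE_SHIFT) | prot;
    if ( HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0 )
        do_exit();

    /* HVMlite: the guest owns its page tables; write the entry
       natively and flush the stale translation for that address. */
    pgentry_t *tab = mfn_to_virt(table_mfn);
    tab[offset] = ((pgentry_t)mfn << PAGE_SHIFT) | prot;
    invlpg(va);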
 arch/x86/mm.c         | 147 +++++++++++++++++++++++++++++++++++++-------------
 arch/x86/traps.c      |  10 ++++
 include/x86/arch_mm.h |   4 ++
 include/x86/os.h      |   9 ++++
 4 files changed, 132 insertions(+), 38 deletions(-)
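A quick note on the two TLB helpers this patch adds to include/x86/os.h,
as used by the non-PV paths in the diff below (illustrative sketch; pgt,
va and pt_base stand for the values at the respective call sites):

    /* One PTE changed (unmap_frames()): flush only that address. */
    *pgt = 0;
    invlpg(va);

    /* Many PTEs changed (set_readonly()): reloading cr3 flushes all
       non-global TLB entries at once. */
    write_cr3((unsigned long)pt_base);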

diff --git a/arch/x86/mm.c b/arch/x86/mm.c
index cbb5617..f5248a4 100644
--- a/arch/x86/mm.c
+++ b/arch/x86/mm.c
@@ -123,16 +123,25 @@ void arch_mm_preinit(void *p)
 * table at offset in previous level MFN (prev_l_mfn). pt_pfn is a guest
  * PFN.
  */
+static pgentry_t pt_prot[PAGETABLE_LEVELS] = {
+    L1_PROT,
+    L2_PROT,
+    L3_PROT,
+#if defined(__x86_64__)
+    L4_PROT,
+#endif
+};
+
 static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, 
                          unsigned long offset, unsigned long level)
 {   
-    pgentry_t *tab = pt_base;
+    pgentry_t *tab;
     unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn); 
-    pgentry_t prot_e, prot_t;
+#ifdef CONFIG_PARAVIRT
     mmu_update_t mmu_updates[1];
     int rc;
+#endif
     
-    prot_e = prot_t = 0;
     DEBUG("Allocating new L%d pt frame for pfn=%lx, "
           "prev_l_mfn=%lx, offset=%lx", 
           level, *pt_pfn, prev_l_mfn, offset);
@@ -140,30 +149,12 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
     /* We need to clear the page, otherwise we might fail to map it
        as a page table page */
     memset((void*) pt_page, 0, PAGE_SIZE);  
- 
-    switch ( level )
-    {
-    case L1_FRAME:
-        prot_e = L1_PROT;
-        prot_t = L2_PROT;
-        break;
-    case L2_FRAME:
-        prot_e = L2_PROT;
-        prot_t = L3_PROT;
-        break;
-#if defined(__x86_64__)
-    case L3_FRAME:
-        prot_e = L3_PROT;
-        prot_t = L4_PROT;
-        break;
-#endif
-    default:
-        printk("new_pt_frame() called with invalid level number %lu\n", level);
-        do_exit();
-        break;
-    }
 
+    ASSERT(level >= 1 && level <= PAGETABLE_LEVELS);
+
+#ifdef CONFIG_PARAVIRT
     /* Make PFN a page table page */
+    tab = pt_base;
 #if defined(__x86_64__)
     tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
 #endif
@@ -172,7 +163,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
     mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + 
         sizeof(pgentry_t) * l1_table_offset(pt_page);
     mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | 
-        (prot_e & ~_PAGE_RW);
+        (pt_prot[level - 1] & ~_PAGE_RW);
     
     if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
     {
@@ -184,13 +175,18 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn,
     /* Hook the new page table page into the hierarchy */
     mmu_updates[0].ptr =
         ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
-    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;
+    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
+        pt_prot[level];
 
     if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) 
     {
         printk("ERROR: mmu_update failed with rc=%d\n", rc);
         do_exit();
     }
+#else
+    tab = mfn_to_virt(prev_l_mfn);
+    tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level];
+#endif
 
     *pt_pfn += 1;
 }
@@ -202,12 +198,14 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
 {
     unsigned long start_address, end_address;
     unsigned long pfn_to_map, pt_pfn = *start_pfn;
-    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     pgentry_t *tab = pt_base, page;
     unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(pt_base));
     unsigned long offset;
+#ifdef CONFIG_PARAVIRT
+    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     int count = 0;
     int rc;
+#endif
 
     /* Be conservative: even if we know there will be more pages already
        mapped, start the loop at the very beginning. */
@@ -225,6 +223,10 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
                ((unsigned long)pfn_to_virt(*max_pfn) - 
                 (unsigned long)&_text)>>20);
     }
+#else
+    /* Round up to next 2MB boundary as we are using 2MB pages on HVMlite. */
+    pfn_to_map = (pfn_to_map + L1_PAGETABLE_ENTRIES - 1) &
+                 ~(L1_PAGETABLE_ENTRIES - 1);
 #endif
 
     start_address = (unsigned long)pfn_to_virt(pfn_to_map);
@@ -257,6 +259,7 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
         pt_mfn = pte_to_mfn(page);
         tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
         offset = l2_table_offset(start_address);        
+#ifdef CONFIG_PARAVIRT
         /* Need new L1 pt frame */
         if ( !(tab[offset] & _PAGE_PRESENT) )
             new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
@@ -288,6 +291,12 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn)
             count = 0;
         }
         start_address += PAGE_SIZE;
+#else
+        if ( !(tab[offset] & _PAGE_PRESENT) )
+            tab[offset] = (pgentry_t)pfn_to_map << PAGE_SHIFT |
+                          L2_PROT | _PAGE_PSE;
+        start_address += 1UL << L2_PAGETABLE_SHIFT;
+#endif
     }
 
     *start_pfn = pt_pfn;
@@ -302,16 +311,19 @@ static void set_readonly(void *text, void *etext)
     unsigned long start_address =
         ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
     unsigned long end_address = (unsigned long) etext;
-    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     pgentry_t *tab = pt_base, page;
     unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base));
     unsigned long offset;
+    unsigned long page_size = PAGE_SIZE;
+#ifdef CONFIG_PARAVIRT
+    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
     int count = 0;
     int rc;
+#endif
 
     printk("setting %p-%p readonly\n", text, etext);
 
-    while ( start_address + PAGE_SIZE <= end_address )
+    while ( start_address + page_size <= end_address )
     {
         tab = pt_base;
         mfn = pfn_to_mfn(virt_to_pfn(pt_base));
@@ -327,26 +339,34 @@ static void set_readonly(void *text, void *etext)
         mfn = pte_to_mfn(page);
         tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
         offset = l2_table_offset(start_address);        
-        page = tab[offset];
-        mfn = pte_to_mfn(page);
-        tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
+        if ( !(tab[offset] & _PAGE_PSE) )
+        {
+            page = tab[offset];
+            mfn = pte_to_mfn(page);
+            tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
 
-        offset = l1_table_offset(start_address);
+            offset = l1_table_offset(start_address);
+        }
 
         if ( start_address != (unsigned long)&shared_info )
         {
+#ifdef CONFIG_PARAVIRT
             mmu_updates[count].ptr = 
                 ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
             mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
             count++;
+#else
+            tab[offset] &= ~_PAGE_RW;
+#endif
         }
         else
             printk("skipped %lx\n", start_address);
 
-        start_address += PAGE_SIZE;
+        start_address += page_size;
 
+#ifdef CONFIG_PARAVIRT
         if ( count == L1_PAGETABLE_ENTRIES || 
-             start_address + PAGE_SIZE > end_address )
+             start_address + page_size > end_address )
         {
             rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
             if ( rc < 0 )
@@ -356,8 +376,13 @@ static void set_readonly(void *text, void *etext)
             }
             count = 0;
         }
+#else
+        if ( start_address == (1UL << L2_PAGETABLE_SHIFT) )
+            page_size = 1UL << L2_PAGETABLE_SHIFT;
+#endif
     }
 
+#ifdef CONFIG_PARAVIRT
     {
         mmuext_op_t op = {
             .cmd = MMUEXT_TLB_FLUSH_ALL,
@@ -365,6 +390,9 @@ static void set_readonly(void *text, void *etext)
         int count;
         HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
     }
+#else
+    write_cr3((unsigned long)pt_base);
+#endif
 }
 
 /*
@@ -394,6 +422,8 @@ static pgentry_t *get_pgt(unsigned long va)
     offset = l2_table_offset(va);
     if ( !(tab[offset] & _PAGE_PRESENT) )
         return NULL;
+    if ( tab[offset] & _PAGE_PSE )
+        return &tab[offset];
     mfn = pte_to_mfn(tab[offset]);
     tab = mfn_to_virt(mfn);
     offset = l1_table_offset(va);
@@ -448,6 +478,9 @@ pgentry_t *need_pgt(unsigned long va)
         new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
     }
     ASSERT(tab[offset] & _PAGE_PRESENT);
+    if ( tab[offset] & _PAGE_PSE )
+        return &tab[offset];
+
     pt_mfn = pte_to_mfn(tab[offset]);
     tab = mfn_to_virt(pt_mfn);
 
@@ -524,8 +557,6 @@ int do_map_frames(unsigned long va,
 {
     pgentry_t *pgt = NULL;
     unsigned long done = 0;
-    unsigned long i;
-    int rc;
 
     if ( !mfns ) 
     {
@@ -539,6 +570,9 @@ int do_map_frames(unsigned long va,
         memset(err, 0x00, n * sizeof(int));
     while ( done < n )
     {
+#ifdef CONFIG_PARAVIRT
+        unsigned long i;
+        int rc;
         unsigned long todo;
 
         if ( err )
@@ -578,6 +612,17 @@ int do_map_frames(unsigned long va,
             }
         }
         done += todo;
+#else
+        if ( !pgt || !(va & L1_MASK) )
+            pgt = need_pgt(va & ~L1_MASK);
+        if ( !pgt )
+            return -ENOMEM;
+
+        ASSERT(!(*pgt & _PAGE_PSE));
+        pgt[l1_table_offset(va)] = (pgentry_t)
+            (((mfns[done * stride] + done * incr) << PAGE_SHIFT) | prot);
+        done++;
+#endif
     }
 
     return 0;
@@ -609,16 +654,21 @@ void *map_frames_ex(const unsigned long *mfns, unsigned long n,
 #define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
 int unmap_frames(unsigned long va, unsigned long num_frames)
 {
+#ifdef CONFIG_PARAVIRT
     int n = UNMAP_BATCH;
     multicall_entry_t call[n];
     int ret;
     int i;
+#else
+    pgentry_t *pgt;
+#endif
 
     ASSERT(!((unsigned long)va & ~PAGE_MASK));
 
     DEBUG("va=%p, num=0x%lx\n", va, num_frames);
 
     while ( num_frames ) {
+#ifdef CONFIG_PARAVIRT
         if ( n > num_frames )
             n = num_frames;
 
@@ -653,6 +703,17 @@ int unmap_frames(unsigned long va, unsigned long num_frames)
             }
         }
         num_frames -= n;
+#else
+        pgt = get_pgt(va);
+        if ( pgt )
+        {
+            ASSERT(!(*pgt & _PAGE_PSE));
+            *pgt = 0;
+            invlpg(va);
+        }
+        va += PAGE_SIZE;
+        num_frames--;
+#endif
     }
     return 0;
 }
@@ -662,14 +723,24 @@ int unmap_frames(unsigned long va, unsigned long num_frames)
  */
 static void clear_bootstrap(void)
 {
+#ifdef CONFIG_PARAVIRT
     pte_t nullpte = { };
     int rc;
+#else
+    pgentry_t *pgt;
+#endif
 
     /* Use first page as the CoW zero page */
     memset(&_text, 0, PAGE_SIZE);
     mfn_zero = virt_to_mfn((unsigned long) &_text);
+#ifdef CONFIG_PARAVIRT
     if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) )
         printk("Unable to unmap NULL page. rc=%d\n", rc);
+#else
+    pgt = get_pgt((unsigned long)&_text);
+    *pgt = 0;
+    invlpg((unsigned long)&_text);
+#endif
 }
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/traps.c b/arch/x86/traps.c
index 2d3222d..aa17da3 100644
--- a/arch/x86/traps.c
+++ b/arch/x86/traps.c
@@ -121,7 +121,9 @@ void page_walk(unsigned long virt_address)
 static int handle_cow(unsigned long addr) {
         pgentry_t *tab = pt_base, page;
        unsigned long new_page;
+#ifdef CONFIG_PARAVIRT
        int rc;
+#endif
 
 #if defined(__x86_64__)
         page = tab[l4_table_offset(addr)];
@@ -137,6 +139,8 @@ static int handle_cow(unsigned long addr) {
         page = tab[l2_table_offset(addr)];
        if (!(page & _PAGE_PRESENT))
            return 0;
+       if ( page & _PAGE_PSE )
+           return 0;
         tab = pte_to_virt(page);
         
         page = tab[l1_table_offset(addr)];
@@ -149,12 +153,18 @@ static int handle_cow(unsigned long addr) {
        new_page = alloc_pages(0);
        memset((void*) new_page, 0, PAGE_SIZE);
 
+#ifdef CONFIG_PARAVIRT
       rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK, __pte(virt_to_mach(new_page) | L1_PROT), UVMF_INVLPG);
        if (!rc)
                return 1;
 
        printk("Map zero page to %lx failed: %d.\n", addr, rc);
        return 0;
+#else
+       tab[l1_table_offset(addr)] = virt_to_mach(new_page) | L1_PROT;
+       invlpg(addr);
+       return 1;
+#endif
 }
 
 static void do_stack_walk(unsigned long frame_base)
diff --git a/include/x86/arch_mm.h b/include/x86/arch_mm.h
index 28ab406..e0ae552 100644
--- a/include/x86/arch_mm.h
+++ b/include/x86/arch_mm.h
@@ -78,6 +78,8 @@
 #define L2_PAGETABLE_ENTRIES    512
 #define L3_PAGETABLE_ENTRIES    4
 
+#define PAGETABLE_LEVELS        3
+
 #define PADDR_BITS              44
 #define PADDR_MASK              ((1ULL << PADDR_BITS)-1)
 
@@ -110,6 +112,8 @@ typedef uint64_t pgentry_t;
 #define L3_PAGETABLE_ENTRIES    512
 #define L4_PAGETABLE_ENTRIES    512
 
+#define PAGETABLE_LEVELS        4
+
 /* These are page-table limitations. Current CPUs support only 40-bit phys. */
 #define PADDR_BITS              52
 #define VADDR_BITS              48
diff --git a/include/x86/os.h b/include/x86/os.h
index 8ccff21..e118b91 100644
--- a/include/x86/os.h
+++ b/include/x86/os.h
@@ -206,6 +206,15 @@ static inline int irqs_disabled(void)
  */
 typedef struct { volatile int counter; } atomic_t;
 
+static inline void write_cr3(unsigned long cr3)
+{
+    asm volatile( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
+}
+
+static inline void invlpg(unsigned long va)
+{
+    asm volatile ( "invlpg %0": : "m" (*(const char *)(va)) : "memory" );
+}
 
 /************************** i386 *******************************/
 #ifdef __INSIDE_MINIOS__
-- 
2.6.6

