
[Xen-devel] [PATCH v5 5/6] xen/arm: Implement hypercall for dirty page tracing



Add a hypercall (shadow op: enable/disable, and clean/peek the dirtied-page bitmap).

It consists of two parts: dirty-page detection and saving.

For detection, we set up the guest p2m's leaf PTEs as read-only, so that whenever the guest
tries to write to a page, a permission fault is raised and traps into Xen.

The permission-faulted GPA must be saved for the toolstack (when it wants to see
which pages have been dirtied). For this purpose, we temporarily record the GPAs in a bitmap.
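
To illustrate the bitmap layout, here is a minimal standalone sketch of the GPA-to-bit
mapping performed on a permission fault, mirroring mark_dirty_bitmap() in the patch below.
EX_PAGE_SHIFT, EX_GUEST_RAM_BASE and the sample GPA are stand-in example values, not
definitions taken from this series:

/* Illustrative only: mirrors the index arithmetic of mark_dirty_bitmap().
 * EX_PAGE_SHIFT and EX_GUEST_RAM_BASE are stand-in values for the example. */
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT      12
#define EX_GUEST_RAM_BASE  0x80000000ULL

int main(void)
{
    uint64_t gpa = 0x80123456ULL;   /* example faulting guest physical address */

    /* one bit per guest page, counted from the start of guest RAM */
    uint64_t bit_index = (gpa - EX_GUEST_RAM_BASE) >> EX_PAGE_SHIFT;

    /* each bitmap page holds PAGE_SIZE * 8 bits */
    unsigned int page_index = bit_index >> (EX_PAGE_SHIFT + 3);
    unsigned int bit_offset = bit_index & ((1ULL << (EX_PAGE_SHIFT + 3)) - 1);

    printf("gpa %#llx -> bitmap page %u, bit %u\n",
           (unsigned long long)gpa, page_index, bit_offset);
    return 0;
}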

 

Changes from v4:

1. For temporarily saving dirty pages, use a bitmap rather than a linked list.

2. Set up the p2m's second-level pages as read-write from the point of view of Xen's
   memory accesses. This happens in the p2m_create_table function.
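
On the consumer side (XEN_DOMCTL_SHADOW_OP_CLEAN / _PEEK), the toolstack receives the
bitmap and recovers the dirtied GPAs by reversing the same arithmetic; log_dirty_op()
below performs the equivalent computation when it re-protects the dirtied entries.
A minimal sketch, again with stand-in constants rather than this series' real definitions:

/* Illustrative only: reverse mapping from (bitmap page, bit) to a guest
 * physical address, matching the addr computation in log_dirty_op().
 * EX_PAGE_SHIFT and EX_GUEST_RAM_BASE are stand-in example values. */
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT      12
#define EX_GUEST_RAM_BASE  0x80000000ULL

static uint64_t dirty_bit_to_gpa(unsigned int page_index, unsigned int bit_offset)
{
    /* bitmap page i covers (PAGE_SIZE * 8) guest pages; bit j selects one of them */
    return EX_GUEST_RAM_BASE
         + ((uint64_t)page_index << (2 * EX_PAGE_SHIFT + 3))
         + ((uint64_t)bit_offset << EX_PAGE_SHIFT);
}

int main(void)
{
    /* bit 291 of bitmap page 0 -> the page dirtied in the previous sketch */
    printf("dirty gpa: %#llx\n",
           (unsigned long long)dirty_bit_to_gpa(0, 291));
    return 0;
}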

 

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>

---

xen/arch/arm/domain.c           |  14 +++

xen/arch/arm/domctl.c           |   9 ++

xen/arch/arm/mm.c               | 103 +++++++++++++++++++-

xen/arch/arm/p2m.c              | 206 ++++++++++++++++++++++++++++++++++++++++

xen/arch/arm/traps.c            |   9 ++

xen/include/asm-arm/domain.h    |   7 ++

xen/include/asm-arm/mm.h        |   7 ++

xen/include/asm-arm/p2m.h       |   4 +

xen/include/asm-arm/processor.h |   2 +

9 files changed, 360 insertions(+), 1 deletion(-)

 

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c

index c0b5dd8..0a32301 100644

--- a/xen/arch/arm/domain.c

+++ b/xen/arch/arm/domain.c

@@ -215,6 +215,12 @@ static void ctxt_switch_to(struct vcpu *n)

     WRITE_SYSREG(hcr, HCR_EL2);

     isb();

 

+    /* for dirty-page tracing

+     * XXX: how do we consider SMP case?

+     */

+    if ( n->domain->arch.dirty.mode )

+        restore_vlpt(n->domain);

+

     /* This is could trigger an hardware interrupt from the virtual

      * timer. The interrupt needs to be injected into the guest. */

     virt_timer_restore(n);

@@ -509,11 +515,19 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)

    /* Default the virtual ID to match the physical */

     d->arch.vpidr = boot_cpu_data.midr.bits;

 

+    /* init for dirty-page tracing */

+    d->arch.dirty.count = 0;

+    d->arch.dirty.mode = 0;

+    spin_lock_init(&d->arch.dirty.lock);

+

     d->arch.dirty.second_lvl_start = 0;

     d->arch.dirty.second_lvl_end = 0;

     d->arch.dirty.second_lvl[0] = NULL;

     d->arch.dirty.second_lvl[1] = NULL;

 

+    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));

+    d->arch.dirty.bitmap_pages = 0;

+

     clear_page(d->shared_info);

     share_xen_page_with_guest(

         virt_to_page(d->shared_info), d, XENSHARE_writable);

diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c

index cb38e59..eb74225 100644

--- a/xen/arch/arm/domctl.c

+++ b/xen/arch/arm/domctl.c

@@ -93,6 +93,15 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,

             xfree(c.data);

     }

     break;

+    case XEN_DOMCTL_shadow_op:

+    {

+        domain_pause(d);

+        ret = dirty_mode_op(d, &domctl->u.shadow_op);

+        domain_unpause(d);

+

+        copyback = 1;

+    }

+    break;

 

     default:

         return -EINVAL;

diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c

index bf13993..d5a0a11 100644

--- a/xen/arch/arm/mm.c

+++ b/xen/arch/arm/mm.c

@@ -845,7 +845,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)

     create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);

}

 

-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };

static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)

{

     lpae_t pte;

@@ -1320,6 +1319,60 @@ int is_iomem_page(unsigned long mfn)

  *  xen: arm: 64-bit guest support and domU FDT autogeneration

  *  will be upstreamed.

  */

+

+static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)

+{

+    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;

+    int bit_index = PFN_DOWN(addr - ram_base);

+    int page_index = bit_index >> (PAGE_SHIFT + 3);

+    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);

+

+    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);

+}

+

+/* routine for dirty-page tracing

+ *

+ * On the first write the guest takes a permission fault, the entry is changed

+ * to read-write, and on retry the write succeeds.

+ *

+ * For locating the p2m entry of the faulting address, we use the virtual-linear page table.

+ * Returns zero if addr is not valid or dirty mode is not set.

+ */

+int handle_page_fault(struct domain *d, paddr_t addr)

+{

+

+    lpae_t *vlp2m_pte = 0;

+    paddr_t gma_start = 0;

+    paddr_t gma_end = 0;

+

+    if ( !d->arch.dirty.mode ) return 0;

+    get_gma_start_end(d, &gma_start, &gma_end);

+

+    /* Ensure that addr is inside guest's RAM */

+    if ( addr < gma_start ||

+         addr > gma_end ) return 0;

+

+    vlp2m_pte = get_vlpt_3lvl_pte(addr);

+    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&

+         vlp2m_pte->p2m.avail == 0 /* reuse avail bit as read-only */ )

+    {

+        lpae_t pte = *vlp2m_pte;

+        pte.p2m.write = 1;

+        write_pte(vlp2m_pte, pte);

+        flush_tlb_local();

+

+        /* It is only necessary to lock between get-dirty-bitmap and

+         * mark-dirty-bitmap. If a get-dirty-bitmap happens immediately before

+         * this lock, the corresponding dirty page will be reported in the

+         * next round of get-dirty-bitmap. */

+        spin_lock(&d->arch.dirty.lock);

+        mark_dirty_bitmap(d, addr);

+        spin_unlock(&d->arch.dirty.lock);

+    }

+

+    return 1;

+}

+

void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end)

{

     if ( start )

@@ -1440,6 +1493,54 @@ void cleanup_vlpt(struct domain *d)

     unmap_domain_page_global(d->arch.dirty.second_lvl[0]);

     unmap_domain_page_global(d->arch.dirty.second_lvl[1]);

}

+

+int prepare_bitmap(struct domain *d)

+{

+    paddr_t gma_start = 0;

+    paddr_t gma_end = 0;

+    int nr_bytes;

+    int nr_pages;

+    int i;

+

+    get_gma_start_end(d, &gma_start, &gma_end);

+

+    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;

+    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;

+

+    BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES );

+

+    for ( i = 0; i < nr_pages; ++i )

+    {

+        struct page_info *page;

+        page = alloc_domheap_page(NULL, 0);

+        if ( page == NULL )

+            goto cleanup_on_failure;

+

+        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));

+        clear_page(d->arch.dirty.bitmap[i]);

+    }

+

+    d->arch.dirty.bitmap_pages = nr_pages;

+    return 0;

+

+cleanup_on_failure:

+    nr_pages = i;

+    for ( i = 0; i < nr_pages; ++i )

+    {

+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);

+    }

+    return -ENOMEM;

+}

+

+void cleanup_bitmap(struct domain *d)

+{

+    int i;

+    for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )

+    {

+        unmap_domain_page_global(d->arch.dirty.bitmap[i]);

+    }

+}

+

/*

  * Local variables:

  * mode: C

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c

index 2d09fef..b7dbf7d 100644

--- a/xen/arch/arm/p2m.c

+++ b/xen/arch/arm/p2m.c

@@ -6,6 +6,8 @@

#include <xen/bitops.h>

#include <asm/flushtlb.h>

#include <asm/gic.h>

+#include <xen/guest_access.h>

+#include <xen/pfn.h>

 

void dump_p2m_lookup(struct domain *d, paddr_t addr)

{

@@ -113,6 +115,10 @@ static int p2m_create_table(struct domain *d,

 

     pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM);

 

+    /* clear the ro bit (the page-table counterpart of the write bit)

+     * so that the table is writable when accessed through the vlpt */

+    pte.pt.ro = 0;

+

     write_pte(entry, pte);

 

     return 0;

@@ -408,6 +414,206 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)

     return p >> PAGE_SHIFT;

}

 

+/* Change types across all p2m entries in a domain */

+void p2m_change_entry_type_global(struct domain *d, enum mg nt)

+{

+    struct p2m_domain *p2m = &d->arch.p2m;

+    paddr_t ram_base;

+    int i1, i2, i3;

+    int first_index, second_index, third_index;

+    lpae_t *first = __map_domain_page(p2m->first_level);

+    lpae_t pte, *second = NULL, *third = NULL;

+

+    get_gma_start_end(d, &ram_base, NULL);

+

+    first_index = first_table_offset((uint64_t)ram_base);

+    second_index = second_table_offset((uint64_t)ram_base);

+    third_index = third_table_offset((uint64_t)ram_base);

+

+    BUG_ON( !first && "Can't map first level p2m." );

+

+    spin_lock(&p2m->lock);

+

+    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )

+    {

+        lpae_walk_t first_pte = first[i1].walk;

+        if ( !first_pte.valid || !first_pte.table )

+            goto out;

+

+        second = map_domain_page(first_pte.base);

+        BUG_ON( !second && "Can't map second level p2m.");

+        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )

+        {

+            lpae_walk_t second_pte = second[i2].walk;

+

+            if ( !second_pte.valid || !second_pte.table )

+                goto out;

+

+            third = map_domain_page(second_pte.base);

+            BUG_ON( !third && "Can't map third level p2m.");

+

+            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )

+            {

+                lpae_walk_t third_pte = third[i3].walk;

+                if ( !third_pte.valid )

+                    goto out;

+

+                pte = third[i3];

+                if ( nt == mg_ro )

+                {

+                    if ( pte.p2m.write == 1 )

+                    {

+                        pte.p2m.write = 0;

+                        pte.p2m.avail = 0;

+                    }

+                    else

+                    {

+                        /* reuse avail bit as an indicator

+                         * of 'actual' read-only */

+                        pte.p2m.avail = 1;

+                    }

+                }

+                else if ( nt == mg_rw )

+                {

+                    if ( pte.p2m.write == 0 && pte.p2m.avail == 0 )

+                    {

+                        pte.p2m.write = 1;

+                    }

+                }

+                write_pte(&third[i3], pte);

+            }

+            unmap_domain_page(third);

+

+            third = NULL;

+            third_index = 0;

+        }

+        unmap_domain_page(second);

+

+        second = NULL;

+        second_index = 0;

+        third_index = 0;

+    }

+

+out:

+    flush_tlb_all_local();

+    if ( third ) unmap_domain_page(third);

+    if ( second ) unmap_domain_page(second);

+    if ( first ) unmap_domain_page(first);

+

+    spin_unlock(&p2m->lock);

+}

+

+/* Read a domain's log-dirty bitmap and stats.

+ * If the operation is a CLEAN, clear the bitmap and stats. */

+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)

+{

+    int peek = 1;

+    int i;

+    int bitmap_size;

+    paddr_t gma_start, gma_end;

+

+    /* This hypercall is called from domain 0, and we do not know which guest's

+     * vlpt is currently mapped in xen_second, so to be sure we restore it here. */

+    restore_vlpt(d);

+

+    get_gma_start_end(d, &gma_start, &gma_end);

+    bitmap_size = (PFN_DOWN(gma_end - gma_start) + 7) / 8; /* one bit per guest page */

+

+    if ( guest_handle_is_null(sc->dirty_bitmap) )

+    {

+        peek = 0;

+    }

+    else

+    {

+        spin_lock(&d->arch.dirty.lock);

+        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )

+        {

+            int j = 0;

+            uint8_t *bitmap;

+            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,

+                                 d->arch.dirty.bitmap[i],

+                                 bitmap_size < PAGE_SIZE ? bitmap_size :

+                                                           PAGE_SIZE);

+            bitmap_size -= PAGE_SIZE;

+

+            /* write-protect the pages reported dirty again, via the vlpt */

+            bitmap = d->arch.dirty.bitmap[i];

+            while ((j = find_next_bit((const long unsigned int *)bitmap,

+                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)

+            {

+                lpae_t *vlpt;

+                paddr_t addr = gma_start +

+                               (i << (2*PAGE_SHIFT+3)) +

+                               (j << PAGE_SHIFT);

+                vlpt = get_vlpt_3lvl_pte(addr);

+                vlpt->p2m.write = 0;

+                j++;

+            }

+        }

+

+        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )

+        {

+            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )

+            {

+                clear_page(d->arch.dirty.bitmap[i]);

+            }

+        }

+

+        spin_unlock(&d->arch.dirty.lock);

+        flush_tlb_local();

+    }

+

+    sc->stats.dirty_count = d->arch.dirty.count;

+

+    return 0;

+}

+

+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)

+{

+    long ret = 0;

+    switch (sc->op)

+    {

+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:

+        case XEN_DOMCTL_SHADOW_OP_OFF:

+        {

+            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;

+

+            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;

+            p2m_change_entry_type_global(d, nt);

+

+            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )

+            {

+                cleanup_vlpt(d);

+                cleanup_bitmap(d);

+            }

+            else

+            {

+                if ( (ret = prepare_vlpt(d)) )

+                   return ret;

+

+                if ( (ret = prepare_bitmap(d)) )

+                {

+                   /* in case of failure, we have to cleanup vlpt */

+                   cleanup_vlpt(d);

+                   return ret;

+                }

+            }

+        }

+        break;

+

+        case XEN_DOMCTL_SHADOW_OP_CLEAN:

+        case XEN_DOMCTL_SHADOW_OP_PEEK:

+        {

+            ret = log_dirty_op(d, sc);

+        }

+        break;

+

+        default:

+            return -ENOSYS;

+    }

+    return ret;

+}

+

/*

  * Local variables:

  * mode: C

diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c

index 287dd7b..1a7ed11 100644

--- a/xen/arch/arm/traps.c

+++ b/xen/arch/arm/traps.c

@@ -1321,6 +1321,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,

     const char *msg;

     int rc, level = -1;

     mmio_info_t info;

+    int page_fault = ( (dabt.dfsc & FSC_MASK) ==

+                          (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write );

 

     if ( !check_conditional_instr(regs, hsr) )

     {

@@ -1342,6 +1344,13 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,

     if ( rc == -EFAULT )

         goto bad_data_abort;

 

+    /* domU page fault handling for guest live migration */

+    /* dabt.valid can be 0 here */

+    if ( page_fault && handle_page_fault(current->domain, info.gpa) )

+    {

+        /* Do not modify pc after page fault to repeat memory operation */

+        return;

+    }

     /* XXX: Decode the instruction if ISS is not valid */

     if ( !dabt.valid )

         goto bad_data_abort;

diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h

index 4f366f1..180d924 100644

--- a/xen/include/asm-arm/domain.h

+++ b/xen/include/asm-arm/domain.h

@@ -114,9 +114,16 @@ struct arch_domain

 

     /* dirty-page tracing */

     struct {

+#define MAX_DIRTY_BITMAP_PAGES 64        /* supports up to 8GB of guest memory */

+        spinlock_t lock;                 /* protect the dirty bitmap */

+        volatile int mode;               /* 1 if dirty-page tracing is enabled */

+        volatile unsigned int count;     /* dirty pages counter */

         volatile int second_lvl_start;   /* for context switch */

         volatile int second_lvl_end;

         lpae_t *second_lvl[2];           /* copy of guest p2m's first */

+        /* dirty bitmap */

+        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES];

+        int bitmap_pages;                /* number of dirty bitmap pages */

     } dirty;

 

}  __cacheline_aligned;

diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h

index a74e135..1ce7a4b 100644

--- a/xen/include/asm-arm/mm.h

+++ b/xen/include/asm-arm/mm.h

@@ -341,11 +341,18 @@ static inline void put_page_and_type(struct page_info *page)

     put_page(page);

}

 

+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };

+

+/* routine for dirty-page tracing */

+int handle_page_fault(struct domain *d, paddr_t addr);

void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end);

int prepare_vlpt(struct domain *d);

void cleanup_vlpt(struct domain *d);

void restore_vlpt(struct domain *d);

 

+int prepare_bitmap(struct domain *d);

+void cleanup_bitmap(struct domain *d);

+

/* calculate the xen's virtual address for accessing the leaf PTE of

  * a given address (GPA) */

static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)

diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h

index c660820..dba9a7b 100644

--- a/xen/include/asm-arm/p2m.h

+++ b/xen/include/asm-arm/p2m.h

@@ -2,6 +2,7 @@

#define _XEN_P2M_H

 

#include <xen/mm.h>

+#include <public/domctl.h>

 

struct domain;

 

@@ -110,6 +111,9 @@ static inline int get_page_and_type(struct page_info *page,

     return rc;

}

 

+void p2m_change_entry_type_global(struct domain *d, enum mg nt);

+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);

+

#endif /* _XEN_P2M_H */

 

/*

diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h

index 5294421..fced6ad 100644

--- a/xen/include/asm-arm/processor.h

+++ b/xen/include/asm-arm/processor.h

@@ -399,6 +399,8 @@ union hsr {

#define FSC_CPR        (0x3a) /* Coprocossor Abort */

 

#define FSC_LL_MASK    (0x03<<0)

+#define FSC_MASK       (0x3f) /* Fault status mask */

+#define FSC_3D_LEVEL   (0x03) /* Third level fault */

 

/* Time counter hypervisor control register */

#define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */

--

1.8.1.2
