[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] [IA64] live migration



# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 86e5d8458c08d057bacd7c578bfa84a219b3d461
# Parent  e585c2dade143d171fb589e5a7a33b6c1fa137a9
[IA64] live migration

Shadow mode and live migration.

Virtualize Dirty bit.

Signed-off-by: Tristan Gingold <tristan.gingold@xxxxxxxx>
---
 tools/libxc/ia64/xc_ia64_linux_restore.c     |    2 
 tools/libxc/ia64/xc_ia64_linux_save.c        |  314 ++++++++++++++++++++++-----
 xen/arch/ia64/asm-offsets.c                  |    5 
 xen/arch/ia64/xen/dom0_ops.c                 |   14 +
 xen/arch/ia64/xen/domain.c                   |  163 ++++++++++++--
 xen/arch/ia64/xen/faults.c                   |   91 +++++++
 xen/arch/ia64/xen/ivt.S                      |   43 +++
 xen/arch/ia64/xen/mm.c                       |   20 +
 xen/arch/ia64/xen/privop.c                   |    3 
 xen/arch/ia64/xen/vhpt.c                     |    2 
 xen/include/asm-ia64/domain.h                |   13 +
 xen/include/asm-ia64/linux-xen/asm/pgtable.h |    5 
 xen/include/asm-ia64/shadow.h                |   18 +
 xen/include/asm-ia64/tlbflush.h              |    4 
 14 files changed, 623 insertions(+), 74 deletions(-)

diff -r e585c2dade14 -r 86e5d8458c08 tools/libxc/ia64/xc_ia64_linux_restore.c
--- a/tools/libxc/ia64/xc_ia64_linux_restore.c  Wed Jul 26 09:02:43 2006 -0600
+++ b/tools/libxc/ia64/xc_ia64_linux_restore.c  Wed Jul 26 09:36:36 2006 -0600
@@ -163,7 +163,7 @@ xc_linux_restore(int xc_handle, int io_f
 
        pfn = page_array[mfn];
 
-        DPRINTF ("xc_linux_restore: page %lu/%lu at %lx\n", mfn, max_pfn, pfn);
+        //DPRINTF("xc_linux_restore: page %lu/%lu at %lx\n", mfn, max_pfn, pfn);
 
        if (read_page(xc_handle, io_fd, dom, page_array[mfn]) < 0)
                goto out;
diff -r e585c2dade14 -r 86e5d8458c08 tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Jul 26 09:02:43 2006 -0600
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Jul 26 09:36:36 2006 -0600
@@ -15,8 +15,72 @@
 
 #include "xg_private.h"
 
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_linux_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS    (4 - 1)       /* limit us to 4 times round loop  */
+#define DEF_MAX_FACTOR   3             /* never send more than 3x nr_pfns */
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
 /* total number of pages used by the current guest */
 static unsigned long max_pfn;
+
+static int xc_ia64_shadow_control(int xc_handle,
+                                  uint32_t domid,
+                                  unsigned int sop,
+                                  unsigned long *dirty_bitmap,
+                                  unsigned long pages,
+                                  xc_shadow_control_stats_t *stats)
+{
+    if (dirty_bitmap != NULL && pages > 0) {
+        int i;
+        unsigned char *bmap = (unsigned char *)dirty_bitmap;
+        unsigned long bmap_bytes =
+            ((pages + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1)) / 8;
+        unsigned int bmap_pages = (bmap_bytes + PAGE_SIZE - 1) / PAGE_SIZE; 
+
+        /* Touch the page so that it is in the TC.
+           FIXME: use a more reliable method.  */
+        for (i = 0 ; i < bmap_pages ; i++)
+            bmap[i * PAGE_SIZE] = 0;
+        /* Because bmap is not page aligned (allocated by malloc), be sure the
+           last page is touched.  */
+        bmap[bmap_bytes - 1] = 0;
+    }
+
+    return xc_shadow_control(xc_handle, domid, sop,
+                             dirty_bitmap, pages, stats);
+}
 
 static inline ssize_t
 write_exact(int fd, void *buf, size_t count)
@@ -77,10 +141,10 @@ xc_linux_save(int xc_handle, int io_fd, 
     xc_dominfo_t info;
 
     int rc = 1;
-    unsigned long N;
 
     //int live  = (flags & XCFLAGS_LIVE);
     int debug = (flags & XCFLAGS_DEBUG);
+    int live  = (flags & XCFLAGS_LIVE);
 
     /* The new domain's shared-info frame number. */
     unsigned long shared_info_frame;
@@ -93,10 +157,38 @@ xc_linux_save(int xc_handle, int io_fd, 
     /* Live mapping of shared info structure */
     shared_info_t *live_shinfo = NULL;
 
+    /* Iteration number.  */
+    int iter;
+
+    /* Number of pages sent in the last iteration (live only).  */
+    unsigned int sent_last_iter;
+
+    /* Number of pages sent (live only).  */
+    unsigned int total_sent;
+
+    /* Size of the shadow bitmap (live only).  */
+    unsigned int bitmap_size = 0;
+
+    /* True if last iteration.  */
+    int last_iter;
+
+    /* Bitmap of pages to be sent.  */
+    unsigned long *to_send = NULL;
+    /* Bitmap of pages not to be sent (because dirtied).  */
+    unsigned long *to_skip = NULL;
+
     char *mem;
 
     if (debug)
         fprintf (stderr, "xc_linux_save (ia64): started dom=%d\n", dom);
+
+    /* If no explicit control parameters given, use defaults */
+    if (!max_iters)
+        max_iters = DEF_MAX_ITERS;
+    if (!max_factor)
+        max_factor = DEF_MAX_FACTOR;
+
+    //initialize_mbit_rate();
 
     if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
         ERR("Could not get domain info");
@@ -124,24 +216,9 @@ xc_linux_save(int xc_handle, int io_fd, 
 
     max_pfn = info.max_memkb >> (PAGE_SHIFT - 10);
 
-
-    /* This is a non-live suspend. Issue the call back to get the
-       domain suspended */
-
-    if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info)) {
-        ERR("Domain appears not to have suspended");
-        goto out;
-    }
-
     page_array = malloc(max_pfn * sizeof(unsigned long));
     if (page_array == NULL) {
         ERR("Could not allocate memory");
-        goto out;
-    }
-
-    if (xc_ia64_get_pfn_list(xc_handle, dom, page_array,
-                             0, max_pfn) != max_pfn) {
-        ERR("Could not get the page frame list");
         goto out;
     }
 
@@ -156,10 +233,13 @@ xc_linux_save(int xc_handle, int io_fd, 
        if the format change.
        The version is hard-coded, don't forget to change the restore code
        too!  */
-    N = 1;
-    if (!write_exact(io_fd, &N, sizeof(unsigned long))) {
-        ERR("write: version");
-        goto out;
+    {
+        unsigned long version = 1;
+
+        if (!write_exact(io_fd, &version, sizeof(unsigned long))) {
+            ERR("write: version");
+            goto out;
+        }
     }
 
     op.cmd = DOM0_DOMAIN_SETUP;
@@ -175,39 +255,165 @@ xc_linux_save(int xc_handle, int io_fd, 
         goto out;
     }
 
-    /* Start writing out the saved-domain record. */
-    for (N = 0; N < max_pfn; N++) {
-        if (page_array[N] == INVALID_MFN)
-            continue;
-        if (debug)
-            fprintf (stderr, "xc_linux_save: page %lx (%lu/%lu)\n",
-                     page_array[N], N, max_pfn);
-
-        if (!write_exact(io_fd, &N, sizeof(N))) {
-            ERR("write: max_pfn");
-            goto out;
-        }
-
-        mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                   PROT_READ|PROT_WRITE, page_array[N]);
-        if (mem == NULL) {
-            ERR("cannot map page");
-            goto out;
-        }
-        if (write(io_fd, mem, PAGE_SIZE) != PAGE_SIZE) {
-            ERR("Error when writing to state file (5)");
-            goto out;
-        }
-        munmap(mem, PAGE_SIZE);
+    /* Domain is still running at this point */
+    if (live) {
+
+        if (xc_ia64_shadow_control(xc_handle, dom,
+                                   DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
+                                   NULL, 0, NULL ) < 0) {
+            ERR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        last_iter = 0;
+
+        bitmap_size = ((max_pfn + BITS_PER_LONG-1) & ~(BITS_PER_LONG-1)) / 8;
+        to_send = malloc(bitmap_size);
+        to_skip = malloc(bitmap_size);
+
+        if (!to_send || !to_skip) {
+            ERR("Couldn't allocate bitmap array");
+            goto out;
+        }
+
+        /* Initially all the pages must be sent.  */
+        memset(to_send, 0xff, bitmap_size);
+
+        if (mlock(to_send, bitmap_size)) {
+            ERR("Unable to mlock to_send");
+            goto out;
+        }
+        if (mlock(to_skip, bitmap_size)) {
+            ERR("Unable to mlock to_skip");
+            goto out;
+        }
+        
+    } else {
+
+        /* This is a non-live suspend. Issue the call back to get the
+           domain suspended */
+
+        last_iter = 1;
+
+        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info)) {
+            ERR("Domain appears not to have suspended");
+            goto out;
+        }
+
+    }
+
+    sent_last_iter = max_pfn;
+    total_sent = 0;
+
+    for (iter = 1; ; iter++) {
+        unsigned int sent_this_iter, skip_this_iter;
+        unsigned long N;
+
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+
+        /* Get the pfn list, as it may change.  */
+        if (xc_ia64_get_pfn_list(xc_handle, dom, page_array,
+                                 0, max_pfn) != max_pfn) {
+            ERR("Could not get the page frame list");
+            goto out;
+        }
+
+        /* Dirtied pages won't be saved.
+           slightly wasteful to peek the whole array every time,
+           but this is fast enough for the moment. */
+        if (!last_iter) {
+            if (xc_ia64_shadow_control(xc_handle, dom,
+                                       DOM0_SHADOW_CONTROL_OP_PEEK,
+                                       to_skip, max_pfn, NULL) != max_pfn) {
+                ERR("Error peeking shadow bitmap");
+                goto out;
+            }
+        }
+
+        /* Start writing out the saved-domain record. */
+        for (N = 0; N < max_pfn; N++) {
+            if (page_array[N] == INVALID_MFN)
+                continue;
+            if (!last_iter) {
+                if (test_bit(N, to_skip) && test_bit(N, to_send))
+                    skip_this_iter++;
+                if (test_bit(N, to_skip) || !test_bit(N, to_send))
+                    continue;
+            }
+
+            if (debug)
+                fprintf(stderr, "xc_linux_save: page %lx (%lu/%lu)\n",
+                        page_array[N], N, max_pfn);
+
+            mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                       PROT_READ|PROT_WRITE, page_array[N]);
+            if (mem == NULL) {
+                /* The page may have moved.
+                   It will be re-marked dirty.
+                   FIXME: to be tracked.  */
+                fprintf(stderr, "cannot map page %lx: %s\n",
+                        page_array[N], strerror (errno));
+                continue;
+            }
+
+            if (!write_exact(io_fd, &N, sizeof(N))) {
+                ERR("write: max_pfn");
+                goto out;
+            }
+
+            if (write(io_fd, mem, PAGE_SIZE) != PAGE_SIZE) {
+                ERR("Error when writing to state file (5)");
+                goto out;
+            }
+            munmap(mem, PAGE_SIZE);
+            sent_this_iter++;
+            total_sent++;
+        }
+
+        if (last_iter)
+            break;
+
+        DPRINTF(" %d: sent %d, skipped %d\n",
+                iter, sent_this_iter, skip_this_iter );
+
+        if (live) {
+            if ( /* ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) || */
+                (iter >= max_iters) || (sent_this_iter+skip_this_iter < 50) ||
+                (total_sent > max_pfn*max_factor)) {
+                DPRINTF("Start last iteration\n");
+                last_iter = 1;
+
+                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info)) {
+                    ERR("Domain appears not to have suspended");
+                    goto out;
+                }
+            }
+
+            /* Pages to be sent are pages which were dirty.  */
+            if (xc_ia64_shadow_control(xc_handle, dom,
+                                       DOM0_SHADOW_CONTROL_OP_CLEAN,
+                                       to_send, max_pfn, NULL ) != max_pfn) {
+                ERR("Error flushing shadow PT");
+                goto out;
+            }
+
+            sent_last_iter = sent_this_iter;
+
+            //print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
+        }
+
     }
 
     fprintf (stderr, "All memory is saved\n");
 
     /* terminate */
-    N = INVALID_MFN;
-    if (!write_exact(io_fd, &N, sizeof(N))) {
-        ERR("Error when writing to state file (6)");
-        goto out;
+    {
+        unsigned long pfn = INVALID_MFN;
+        if (!write_exact(io_fd, &pfn, sizeof(pfn))) {
+            ERR("Error when writing to state file (6)");
+            goto out;
+        }
     }
 
     /* Send through a list of all the PFNs that were not in map at the close */
@@ -274,8 +480,16 @@ xc_linux_save(int xc_handle, int io_fd, 
 
  out:
 
-    free (page_array);
-
+    if (live) {
+        if (xc_ia64_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
+                                   NULL, 0, NULL ) < 0) {
+            DPRINTF("Warning - couldn't disable shadow mode");
+        }
+    }
+
+    free(page_array);
+    free(to_send);
+    free(to_skip);
     if (live_shinfo)
         munmap(live_shinfo, PAGE_SIZE);
 
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/asm-offsets.c
--- a/xen/arch/ia64/asm-offsets.c       Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/asm-offsets.c       Wed Jul 26 09:36:36 2006 -0600
@@ -65,6 +65,11 @@ void foo(void)
        DEFINE(IA64_VCPU_DTLB_OFFSET, offsetof (struct vcpu, arch.dtlb));
 
        BLANK();
+
+       DEFINE(IA64_DOMAIN_SHADOW_BITMAP_OFFSET, offsetof (struct domain, arch.shadow_bitmap));
+
+       BLANK();
+
        DEFINE(IA64_CPUINFO_ITM_NEXT_OFFSET, offsetof (struct cpuinfo_ia64, itm_next));
        DEFINE(IA64_CPUINFO_KSOFTIRQD_OFFSET, offsetof (struct cpuinfo_ia64, ksoftirqd));
 
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/dom0_ops.c
--- a/xen/arch/ia64/xen/dom0_ops.c      Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/dom0_ops.c      Wed Jul 26 09:36:36 2006 -0600
@@ -265,6 +265,20 @@ long arch_do_dom0_op(dom0_op_t *op, XEN_
     }
     break;
 
+    case DOM0_SHADOW_CONTROL:
+    {
+        struct domain *d; 
+        ret = -ESRCH;
+        d = find_domain_by_id(op->u.shadow_control.domain);
+        if ( d != NULL )
+        {
+            ret = shadow_mode_control(d, &op->u.shadow_control);
+            put_domain(d);
+            copy_to_guest(u_dom0_op, op, 1);
+        } 
+    }
+    break;
+
     default:
         printf("arch_do_dom0_op: unrecognized dom0 op: %d!!!\n",op->cmd);
         ret = -ENOSYS;
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/domain.c
--- a/xen/arch/ia64/xen/domain.c        Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/domain.c        Wed Jul 26 09:36:36 2006 -0600
@@ -25,26 +25,15 @@
 #include <xen/mm.h>
 #include <xen/iocap.h>
 #include <asm/asm-xsi-offsets.h>
-#include <asm/ptrace.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/hw_irq.h>
-#include <asm/setup.h>
-//#include <asm/mpspec.h>
-#include <xen/irq.h>
 #include <xen/event.h>
-//#include <xen/shadow.h>
 #include <xen/console.h>
 #include <xen/compile.h>
-
 #include <xen/elf.h>
-//#include <asm/page.h>
 #include <asm/pgalloc.h>
-
 #include <asm/offsets.h>  /* for IA64_THREAD_INFO_SIZE */
-
 #include <asm/vcpu.h>   /* for function declarations */
 #include <public/arch-ia64.h>
 #include <xen/domain.h>
@@ -52,13 +41,12 @@
 #include <asm/vmx_vcpu.h>
 #include <asm/vmx_vpd.h>
 #include <asm/vmx_phy_mode.h>
-#include <asm/pal.h>
 #include <asm/vhpt.h>
-#include <public/hvm/ioreq.h>
 #include <public/arch-ia64.h>
 #include <asm/tlbflush.h>
 #include <asm/regionreg.h>
 #include <asm/dom_fw.h>
+#include <asm/shadow.h>
 #include <asm/privop_stat.h>
 
 #ifndef CONFIG_XEN_IA64_DOM0_VP
@@ -388,8 +376,11 @@ void arch_domain_destroy(struct domain *
        BUG_ON(d->arch.mm.pgd != NULL);
        if (d->shared_info != NULL)
            free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
-
-       domain_flush_destroy (d);
+       if (d->arch.shadow_bitmap != NULL)
+               xfree(d->arch.shadow_bitmap);
+
+       /* Clear vTLB for the next domain.  */
+       domain_flush_tlb_vhpt(d);
 
        deallocate_rid_range(d);
 }
@@ -594,6 +585,148 @@ domain_set_shared_info_va (unsigned long
        return 0;
 }
 
+/* Transfer and clear the shadow bitmap in 1kB chunks for L1 cache. */
+#define SHADOW_COPY_CHUNK (1024 / sizeof (unsigned long))
+
+int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc)
+{
+       unsigned int op = sc->op;
+       int          rc = 0;
+       int i;
+       //struct vcpu *v;
+
+       if (unlikely(d == current->domain)) {
+               DPRINTK("Don't try to do a shadow op on yourself!\n");
+               return -EINVAL;
+       }   
+
+       domain_pause(d);
+
+       switch (op)
+       {
+       case DOM0_SHADOW_CONTROL_OP_OFF:
+               if (shadow_mode_enabled (d)) {
+                       u64 *bm = d->arch.shadow_bitmap;
+
+                       /* Flush vhpt and tlb to restore dirty bit usage.  */
+                       domain_flush_tlb_vhpt(d);
+
+                       /* Free bitmap.  */
+                       d->arch.shadow_bitmap_size = 0;
+                       d->arch.shadow_bitmap = NULL;
+                       xfree(bm);
+               }
+               break;
+
+       case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+       case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE:
+               rc = -EINVAL;
+               break;
+
+       case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+               if (shadow_mode_enabled(d)) {
+                       rc = -EINVAL;
+                       break;
+               }
+
+               atomic64_set(&d->arch.shadow_fault_count, 0);
+               atomic64_set(&d->arch.shadow_dirty_count, 0);
+
+               d->arch.shadow_bitmap_size = (d->max_pages + BITS_PER_LONG-1) &
+                                            ~(BITS_PER_LONG-1);
+               d->arch.shadow_bitmap = xmalloc_array(unsigned long,
+                                  d->arch.shadow_bitmap_size / BITS_PER_LONG);
+               if (d->arch.shadow_bitmap == NULL) {
+                       d->arch.shadow_bitmap_size = 0;
+                       rc = -ENOMEM;
+               }
+               else {
+                       memset(d->arch.shadow_bitmap, 0, 
+                              d->arch.shadow_bitmap_size / 8);
+                       
+                       /* Flush vhpt and tlb to enable dirty bit
+                          virtualization.  */
+                       domain_flush_tlb_vhpt(d);
+               }
+               break;
+
+       case DOM0_SHADOW_CONTROL_OP_FLUSH:
+               atomic64_set(&d->arch.shadow_fault_count, 0);
+               atomic64_set(&d->arch.shadow_dirty_count, 0);
+               break;
+   
+       case DOM0_SHADOW_CONTROL_OP_CLEAN:
+         {
+               int nbr_longs;
+
+               sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
+               sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
+
+               atomic64_set(&d->arch.shadow_fault_count, 0);
+               atomic64_set(&d->arch.shadow_dirty_count, 0);
+ 
+               if (guest_handle_is_null(sc->dirty_bitmap) ||
+                   (d->arch.shadow_bitmap == NULL)) {
+                       rc = -EINVAL;
+                       break;
+               }
+
+               if (sc->pages > d->arch.shadow_bitmap_size)
+                       sc->pages = d->arch.shadow_bitmap_size; 
+
+               nbr_longs = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
+
+               for (i = 0; i < nbr_longs; i += SHADOW_COPY_CHUNK) {
+                       int size = (nbr_longs - i) > SHADOW_COPY_CHUNK ?
+                                  SHADOW_COPY_CHUNK : nbr_longs - i;
+     
+                       if (copy_to_guest_offset(sc->dirty_bitmap, i,
+                                                d->arch.shadow_bitmap + i,
+                                                size)) {
+                               rc = -EFAULT;
+                               break;
+                       }
+
+                       memset(d->arch.shadow_bitmap + i,
+                              0, size * sizeof(unsigned long));
+               }
+               
+               break;
+         }
+
+       case DOM0_SHADOW_CONTROL_OP_PEEK:
+       {
+               unsigned long size;
+
+               sc->stats.fault_count = atomic64_read(&d->arch.shadow_fault_count);
+               sc->stats.dirty_count = atomic64_read(&d->arch.shadow_dirty_count);
+
+               if (guest_handle_is_null(sc->dirty_bitmap) ||
+                   (d->arch.shadow_bitmap == NULL)) {
+                       rc = -EINVAL;
+                       break;
+               }
+ 
+               if (sc->pages > d->arch.shadow_bitmap_size)
+                       sc->pages = d->arch.shadow_bitmap_size; 
+
+               size = (sc->pages + BITS_PER_LONG - 1) / BITS_PER_LONG;
+               if (copy_to_guest(sc->dirty_bitmap, 
+                                 d->arch.shadow_bitmap, size)) {
+                       rc = -EFAULT;
+                       break;
+               }
+               break;
+       }
+       default:
+               rc = -EINVAL;
+               break;
+       }
+       
+       domain_unpause(d);
+       
+       return rc;
+}
 
 // remove following line if not privifying in memory
 //#define HAVE_PRIVIFY_MEMORY
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/faults.c
--- a/xen/arch/ia64/xen/faults.c        Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/faults.c        Wed Jul 26 09:36:36 2006 -0600
@@ -1,4 +1,3 @@
-
 /*
  * Miscellaneous process/domain related routines
  * 
@@ -29,6 +28,7 @@
 #include <asm/bundle.h>
 #include <asm/privop_stat.h>
 #include <asm/asm-xsi-offsets.h>
+#include <asm/shadow.h>
 
 extern void die_if_kernel(char *str, struct pt_regs *regs, long err);
 /* FIXME: where these declarations shold be there ? */
@@ -648,3 +648,92 @@ ia64_handle_reflection (unsigned long if
        reflect_interruption(isr,regs,vector);
 }
 
+void
+ia64_shadow_fault(unsigned long ifa, unsigned long itir,
+                  unsigned long isr, struct pt_regs *regs)
+{
+       struct vcpu *v = current;
+       struct domain *d = current->domain;
+       unsigned long gpfn;
+       unsigned long pte = 0;
+       struct vhpt_lf_entry *vlfe;
+
+       /* There are 2 jobs to do:
+          -  marking the page as dirty (the metaphysical address must be
+             extracted to do that).
+          -  reflecting or not the fault (the virtual Dirty bit must be
+             extracted to decide).
+          Unfortunately this information is not immediately available!
+       */
+
+       /* Extract the metaphysical address.
+          Try to get it from VHPT and M2P as we need the flags.  */
+       vlfe = (struct vhpt_lf_entry *)ia64_thash(ifa);
+       pte = vlfe->page_flags;
+       if (vlfe->ti_tag == ia64_ttag(ifa)) {
+               /* The VHPT entry is valid.  */
+               gpfn = get_gpfn_from_mfn((pte & _PAGE_PPN_MASK) >> PAGE_SHIFT);
+               BUG_ON(gpfn == INVALID_M2P_ENTRY);
+       }
+       else {
+               unsigned long itir, iha;
+               IA64FAULT fault;
+
+               /* The VHPT entry is not valid.  */
+               vlfe = NULL;
+
+               /* FIXME: gives a chance to tpa, as the TC was valid.  */
+
+               fault = vcpu_translate(v, ifa, 1, &pte, &itir, &iha);
+
+               /* Try again!  */
+               if (fault != IA64_NO_FAULT) {
+                       /* This will trigger a dtlb miss.  */
+                       ia64_ptcl(ifa, PAGE_SHIFT << 2);
+                       return;
+               }
+               gpfn = ((pte & _PAGE_PPN_MASK) >> PAGE_SHIFT);
+               if (pte & _PAGE_D)
+                       pte |= _PAGE_VIRT_D;
+       }
+
+       /* Set the dirty bit in the bitmap.  */
+       shadow_mark_page_dirty (d, gpfn);
+
+       /* Update the local TC/VHPT and decide whether or not the fault should
+          be reflected.
+          SMP note: we almost ignore the other processors.  The shadow_bitmap
+          has been atomically updated.  If the dirty fault happen on another
+          processor, it will do its job.
+       */
+
+       if (pte != 0) {
+               /* We will know how to handle the fault.  */
+
+               if (pte & _PAGE_VIRT_D) {
+                       /* Rewrite VHPT entry.
+                          There is no race here because only the
+                          cpu VHPT owner can write page_flags.  */
+                       if (vlfe)
+                               vlfe->page_flags = pte | _PAGE_D;
+                       
+                       /* Purge the TC locally.
+                          It will be reloaded from the VHPT iff the
+                          VHPT entry is still valid.  */
+                       ia64_ptcl(ifa, PAGE_SHIFT << 2);
+
+                       atomic64_inc(&d->arch.shadow_fault_count);
+               }
+               else {
+                       /* Reflect.
+                          In this case there is no need to purge.  */
+                       ia64_handle_reflection(ifa, regs, isr, 0, 8);
+               }
+       }
+       else {
+               /* We don't know whether or not the fault must be
+                  reflected.  The VHPT entry is not valid.  */
+               /* FIXME: in metaphysical mode, we could do an ITC now.  */
+               ia64_ptcl(ifa, PAGE_SHIFT << 2);
+       }
+}
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/ivt.S
--- a/xen/arch/ia64/xen/ivt.S   Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/ivt.S   Wed Jul 26 09:36:36 2006 -0600
@@ -746,7 +746,48 @@ ENTRY(dirty_bit)
 ENTRY(dirty_bit)
        DBG_FAULT(8)
 #ifdef XEN
-       FAULT_OR_REFLECT(8)
+       mov r20=cr.ipsr
+       mov r31=pr;;
+       extr.u r20=r20,IA64_PSR_CPL0_BIT,2;;
+       mov r19=8       /* prepare to save predicates */
+       cmp.eq p6,p0=r0,r20     /* cpl == 0?*/
+(p6)   br.sptk.few dispatch_to_fault_handler
+       /* If shadow mode is not enabled, reflect the fault.  */
+       movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET
+       ;;
+       ld8 r22=[r22]
+       ;;
+       add r22=IA64_VCPU_DOMAIN_OFFSET,r22
+       ;;
+       /* Read domain.  */
+       ld8 r22=[r22]
+       ;;
+       add r22=IA64_DOMAIN_SHADOW_BITMAP_OFFSET,r22
+       ;;
+       ld8 r22=[r22]
+       ;;
+       cmp.eq p6,p0=r0,r22     /* !shadow_bitmap ?*/
+(p6)   br.dptk.many dispatch_reflection
+
+       SAVE_MIN_WITH_COVER
+       alloc r14=ar.pfs,0,0,4,0
+       mov out0=cr.ifa
+       mov out1=cr.itir
+       mov out2=cr.isr
+       adds out3=16,sp
+
+       ssm psr.ic | PSR_DEFAULT_BITS
+       ;;
+       srlz.i                                  // guarantee that interruption collection is on
+       ;;
+(p15)  ssm psr.i                               // restore psr.i
+       adds r3=8,r2                            // set up second base pointer
+       ;;
+       SAVE_REST
+       movl r14=ia64_leave_kernel
+       ;;
+       mov rp=r14
+       br.call.sptk.many b6=ia64_shadow_fault
 #else
        /*
         * What we do here is to simply turn on the dirty bit in the PTE.  We 
need to
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c    Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/mm.c    Wed Jul 26 09:36:36 2006 -0600
@@ -170,6 +170,7 @@
 #include <asm/pgalloc.h>
 #include <asm/vhpt.h>
 #include <asm/vcpu.h>
+#include <asm/shadow.h>
 #include <linux/efi.h>
 
 #ifndef CONFIG_XEN_IA64_DOM0_VP
@@ -470,7 +471,7 @@ u64 translate_domain_pte(u64 pteval, u64
        pteval2 &= _PAGE_PPN_MASK; // ignore non-addr bits
        pteval2 |= (pteval & _PAGE_ED);
        pteval2 |= _PAGE_PL_2; // force PL0->2 (PL3 is unaffected)
-       pteval2 = (pteval & ~_PAGE_PPN_MASK) | pteval2;
+       pteval2 |= (pteval & ~_PAGE_PPN_MASK);
        /*
         * Don't let non-dom0 domains map uncached addresses.  This can
         * happen when domU tries to touch i/o port space.  Also prevents
@@ -481,6 +482,18 @@ u64 translate_domain_pte(u64 pteval, u64
         */
        if (d != dom0 && (pteval2 & _PAGE_MA_MASK) != _PAGE_MA_NAT)
                pteval2 &= ~_PAGE_MA_MASK;
+
+    /* If shadow mode is enabled, virtualize dirty bit.  */
+    if (shadow_mode_enabled(d) && (pteval2 & _PAGE_D)) {
+        u64 mp_page = mpaddr >> PAGE_SHIFT;
+        pteval2 |= _PAGE_VIRT_D;
+
+        /* If the page is not already dirty, don't set the dirty bit.
+           This is a small optimization!  */
+        if (mp_page < d->arch.shadow_bitmap_size * 8
+            && !test_bit(mp_page, d->arch.shadow_bitmap))
+            pteval2 = (pteval2 & ~_PAGE_D);
+    }
 
        return pteval2;
 }
@@ -1418,10 +1431,13 @@ guest_physmap_remove_page(struct domain 
 
 //XXX sledgehammer.
 //    flush finer range.
-void
+static void
 domain_page_flush(struct domain* d, unsigned long mpaddr,
                   unsigned long old_mfn, unsigned long new_mfn)
 {
+    if (shadow_mode_enabled(d))
+        shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
+
     domain_flush_vtlb_all();
 }
 
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/privop.c
--- a/xen/arch/ia64/xen/privop.c        Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/privop.c        Wed Jul 26 09:36:36 2006 -0600
@@ -686,7 +686,8 @@ priv_emulate(VCPU *vcpu, REGS *regs, UIN
                (void)vcpu_increment_iip(vcpu);
        }
        if (fault == IA64_ILLOP_FAULT)
-               printf("priv_emulate: priv_handle_op fails, isr=0x%lx\n",isr);
+               printf("priv_emulate: priv_handle_op fails, "
+                      "isr=0x%lx iip=%lx\n",isr, regs->cr_iip);
        return fault;
 }
 
diff -r e585c2dade14 -r 86e5d8458c08 xen/arch/ia64/xen/vhpt.c
--- a/xen/arch/ia64/xen/vhpt.c  Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/arch/ia64/xen/vhpt.c  Wed Jul 26 09:36:36 2006 -0600
@@ -236,7 +236,7 @@ static void flush_tlb_vhpt_all (struct d
        local_flush_tlb_all ();
 }
 
-void domain_flush_destroy (struct domain *d)
+void domain_flush_tlb_vhpt(struct domain *d)
 {
        /* Very heavy...  */
        on_each_cpu ((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
diff -r e585c2dade14 -r 86e5d8458c08 xen/include/asm-ia64/domain.h
--- a/xen/include/asm-ia64/domain.h     Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/include/asm-ia64/domain.h     Wed Jul 26 09:36:36 2006 -0600
@@ -48,6 +48,9 @@ extern unsigned long domain_set_shared_i
    If sync_only is true, only synchronize I&D caches,
    if false, flush and invalidate caches.  */
 extern void domain_cache_flush (struct domain *d, int sync_only);
+
+/* Control the shadow mode.  */
+extern int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc);
 
 /* Cleanly crash the current domain with a message.  */
 extern void panic_domain(struct pt_regs *, const char *, ...)
@@ -117,6 +120,16 @@ struct arch_domain {
     /* Address of fpswa_interface_t (placed in domain memory)  */
     void *fpswa_inf;
 
+    /* Bitmap of shadow dirty bits.
+       Set iff shadow mode is enabled.  */
+    u64 *shadow_bitmap;
+    /* Length (in bits!) of shadow bitmap.  */
+    unsigned long shadow_bitmap_size;
+    /* Number of bits set in bitmap.  */
+    atomic64_t shadow_dirty_count;
+    /* Number of faults.  */
+    atomic64_t shadow_fault_count;
+
     struct last_vcpu last_vcpu[NR_CPUS];
 };
 #define INT_ENABLE_OFFSET(v)             \
diff -r e585c2dade14 -r 86e5d8458c08 xen/include/asm-ia64/linux-xen/asm/pgtable.h
--- a/xen/include/asm-ia64/linux-xen/asm/pgtable.h      Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/include/asm-ia64/linux-xen/asm/pgtable.h      Wed Jul 26 09:36:36 2006 -0600
@@ -62,7 +62,12 @@
 #define _PAGE_D                        (1 << _PAGE_D_BIT)      /* page dirty 
bit */
 #define _PAGE_PPN_MASK         (((__IA64_UL(1) << IA64_MAX_PHYS_BITS) - 1) & 
~0xfffUL)
 #define _PAGE_ED               (__IA64_UL(1) << 52)    /* exception deferral */
+#ifdef XEN
+#define _PAGE_VIRT_D           (__IA64_UL(1) << 53)    /* Virtual dirty bit */
+#define _PAGE_PROTNONE         0
+#else
 #define _PAGE_PROTNONE         (__IA64_UL(1) << 63)
+#endif
 
 /* Valid only for a PTE with the present bit cleared: */
 #define _PAGE_FILE             (1 << 1)                /* see swap & file pte 
remarks below */
diff -r e585c2dade14 -r 86e5d8458c08 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h     Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/include/asm-ia64/shadow.h     Wed Jul 26 09:36:36 2006 -0600
@@ -45,6 +45,24 @@ void guest_physmap_remove_page(struct do
 void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned 
long mfn);
 #endif
 
+static inline int
+shadow_mode_enabled(struct domain *d)
+{
+    return d->arch.shadow_bitmap != NULL;
+}
+
+static inline int
+shadow_mark_page_dirty(struct domain *d, unsigned long gpfn)
+{
+    if (gpfn < d->arch.shadow_bitmap_size * 8
+        && !test_and_set_bit(gpfn, d->arch.shadow_bitmap)) {
+        /* The page was not dirty.  */
+        atomic64_inc(&d->arch.shadow_dirty_count);
+        return 1;
+    } else
+        return 0;
+}
+
 #endif // _XEN_SHADOW_H
 
 /*
diff -r e585c2dade14 -r 86e5d8458c08 xen/include/asm-ia64/tlbflush.h
--- a/xen/include/asm-ia64/tlbflush.h   Wed Jul 26 09:02:43 2006 -0600
+++ b/xen/include/asm-ia64/tlbflush.h   Wed Jul 26 09:36:36 2006 -0600
@@ -22,8 +22,8 @@ void domain_flush_vtlb_all (void);
 /* Global range-flush of vTLB.  */
 void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range);
 
-/* Final vTLB flush on every dirty cpus.  */
-void domain_flush_destroy (struct domain *d);
+/* Flush vhpt and mTLB on every dirty cpus.  */
+void domain_flush_tlb_vhpt(struct domain *d);
 
 /* Flush v-tlb on cpus set in mask for current domain.  */
 void flush_tlb_mask(cpumask_t mask);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.