[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] tmem: save/restore/migrate/livemigrate and shared pool authentication



Attached patch implements save/restore/migration/livemigration
for transcendent memory ("tmem").  Without this patch, domains
using tmem may in some cases lose data when doing save/restore
or migrate/livemigrate.  Also included in this patch is
support for a new (privileged) hypercall for authorizing
domains to share pools; this provides the foundation to
accommodate upstream Linux requests for security for shared
pools.

Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>

(Inline and attachment in case my mailer botches it.)

=============================
diff -r 5333e6497af6 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/libxc/xc_domain_restore.c   Wed Aug 05 11:17:18 2009 -0600
@@ -533,6 +533,27 @@ int xc_domain_restore(int xc_handle, int
             }
 
             xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+            continue;
+        }
+
+        if ( j == -5 )
+        {
+            DPRINTF("xc_domain_restore start tmem\n");
+            if ( xc_tmem_restore(xc_handle, dom, io_fd) )
+            {
+                ERROR("error reading/restoring tmem");
+                goto out;
+            }
+            continue;
+        }
+
+        if ( j == -6 )
+        {
+            if ( xc_tmem_restore_extra(xc_handle, dom, io_fd) )
+            {
+                ERROR("error reading/restoring tmem extra");
+                goto out;
+            }
             continue;
         }
 
diff -r 5333e6497af6 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/libxc/xc_domain_save.c      Wed Aug 05 11:17:18 2009 -0600
@@ -758,6 +758,7 @@ int xc_domain_save(int xc_handle, int io
     int live  = (flags & XCFLAGS_LIVE);
     int debug = (flags & XCFLAGS_DEBUG);
     int race = 0, sent_last_iter, skip_this_iter;
+    int tmem_saved = 0;
 
     /* The new domain's shared-info frame number. */
     unsigned long shared_info_frame;
@@ -995,6 +996,13 @@ int xc_domain_save(int xc_handle, int io
     }
 
     print_stats(xc_handle, dom, 0, &stats, 0);
+
+    tmem_saved = xc_tmem_save(xc_handle, dom, io_fd, live, -5);
+    if ( tmem_saved == -1 )
+    {
+        ERROR("Error when writing to state file (tmem)");
+        goto out;
+    }
 
     /* Now write out each data page, canonicalising page tables as we go... */
     for ( ; ; )
@@ -1316,6 +1324,13 @@ int xc_domain_save(int xc_handle, int io
                 }
 
                 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
+                if ( (tmem_saved > 0) &&
+                     (xc_tmem_save_extra(xc_handle,dom,io_fd,-6) == -1) )
+                {
+                        ERROR("Error when writing to state file (tmem)");
+                        goto out;
+                }
+
             }
 
             if ( xc_shadow_control(xc_handle, dom, 
@@ -1605,6 +1620,9 @@ int xc_domain_save(int xc_handle, int io
 
  out:
 
+    if ( tmem_saved != 0 && live )
+        xc_tmem_save_done(xc_handle, dom);
+
     if ( live )
     {
         if ( xc_shadow_control(xc_handle, dom, 
diff -r 5333e6497af6 tools/libxc/xc_tmem.c
--- a/tools/libxc/xc_tmem.c     Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/libxc/xc_tmem.c     Wed Aug 05 11:17:18 2009 -0600
@@ -36,6 +36,7 @@ int xc_tmem_control(int xc,
                     uint32_t cli_id,
                     uint32_t arg1,
                     uint32_t arg2,
+                    uint64_t arg3,
                     void *buf)
 {
     tmem_op_t op;
@@ -45,9 +46,10 @@ int xc_tmem_control(int xc,
     op.pool_id = pool_id;
     op.u.ctrl.subop = subop;
     op.u.ctrl.cli_id = cli_id;
+    set_xen_guest_handle(op.u.ctrl.buf,buf);
     op.u.ctrl.arg1 = arg1;
     op.u.ctrl.arg2 = arg2;
-    op.u.ctrl.buf.p = buf;
+    op.u.ctrl.arg3 = arg3;
 
     if (subop == TMEMC_LIST) {
         if ((arg1 != 0) && (lock_pages(buf, arg1) != 0))
@@ -72,6 +74,376 @@ int xc_tmem_control(int xc,
     return rc;
 }
 
+static int xc_tmem_uuid_parse(char *uuid_str, uint64_t *uuid_lo, uint64_t *uuid_hi)
+{
+    char *p = uuid_str;
+    uint64_t *x = uuid_hi;
+    int i = 0, digit;
+
+    *uuid_lo = 0; *uuid_hi = 0;
+    for ( p = uuid_str, i = 0; i != 36 && *p != '\0'; p++, i++ )
+    {
+        if ( (i == 8 || i == 13 || i == 18 || i == 23) )
+        {
+            if ( *p != '-' )
+                return -1;
+            if ( i == 18 )
+                x = uuid_lo;
+            continue;
+        }
+        else if ( *p >= '0' && *p <= '9' )
+            digit = *p - '0';
+        else if ( *p >= 'A' && *p <= 'F' )
+            digit = *p - 'A';
+        else if ( *p >= 'a' && *p <= 'f' )
+            digit = *p - 'a';
+        else
+            return -1;
+        *x = (*x << 4) | digit;
+    }
+    if ( (i != 1 && i != 36) || *p != '\0' )
+        return -1;
+    return 0;
+}
+
+int xc_tmem_auth(int xc,
+                 int cli_id,
+                 char *uuid_str,
+                 int arg1)
+{
+    tmem_op_t op;
+
+    op.cmd = TMEM_AUTH;
+    op.pool_id = 0;
+    op.u.new.arg1 = cli_id;
+    op.u.new.flags = arg1;
+    if ( xc_tmem_uuid_parse(uuid_str, &op.u.new.uuid[0],
+                                      &op.u.new.uuid[1]) < 0 )
+    {
+        PERROR("Can't parse uuid, use xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
+        return -1;
+    }
+
+    return do_tmem_op(xc, &op);
+}
+
+/* Save/restore/live migrate */
+
+/*
+   Note that live migration complicates the save/restore format in
+   multiple ways: Though saving/migration can only occur when all
+   tmem pools belonging to the domain-being-saved are frozen and
+   this ensures that new pools can't be created or existing pools
+   grown (in number of pages), it is possible during a live migration
+   that pools may be destroyed and pages invalidated while the migration
+   is in process.  As a result, (1) it is not safe to pre-specify counts
+   for these values precisely, but only as a "max", and (2) a "invalidation"
+   list (of pools, objects, pages) must be appended when the domain is truly
+   suspended.
+ */
+
+/* returns 0 if nothing to save, -1 if error saving, 1 if saved successfully */
+int xc_tmem_save(int xc, int dom, int io_fd, int live, int field_marker)
+{
+    int marker = field_marker;
+    int i, j;
+    uint32_t max_pools, version;
+    uint32_t weight, cap, flags;
+    uint32_t pool_id;
+    uint32_t minusone = -1;
+    struct tmem_handle *h;
+
+    if ( xc_tmem_control(xc,0,TMEMC_SAVE_BEGIN,dom,live,0,0,NULL) <= 0 )
+        return 0;
+
+    if ( write_exact(io_fd, &marker, sizeof(marker)) )
+        return -1;
+    version = xc_tmem_control(xc,0,TMEMC_SAVE_GET_VERSION,0,0,0,0,NULL);
+    if ( write_exact(io_fd, &version, sizeof(version)) )
+        return -1;
+    max_pools = xc_tmem_control(xc,0,TMEMC_SAVE_GET_MAXPOOLS,0,0,0,0,NULL);
+    if ( write_exact(io_fd, &max_pools, sizeof(max_pools)) )
+        return -1;
+    if ( version == -1 || max_pools == -1 )
+        return -1;
+    if ( write_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+    flags = xc_tmem_control(xc,0,TMEMC_SAVE_GET_CLIENT_FLAGS,dom,0,0,0,NULL);
+    if ( write_exact(io_fd, &flags, sizeof(flags)) )
+        return -1;
+    weight = xc_tmem_control(xc,0,TMEMC_SAVE_GET_CLIENT_WEIGHT,dom,0,0,0,NULL);
+    if ( write_exact(io_fd, &weight, sizeof(weight)) )
+        return -1;
+    cap = xc_tmem_control(xc,0,TMEMC_SAVE_GET_CLIENT_CAP,dom,0,0,0,NULL);
+    if ( write_exact(io_fd, &cap, sizeof(cap)) )
+        return -1;
+    if ( flags == -1 || weight == -1 || cap == -1 )
+        return -1;
+    if ( write_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+    for ( i = 0; i < max_pools; i++ )
+    {
+        uint64_t uuid[2];
+        uint32_t n_pages;
+        uint32_t pagesize;
+        char *buf = NULL;
+        int bufsize = 0;
+        int checksum = 0;
+
+        /* get pool id, flags, pagesize, n_pages, uuid */
+        flags = xc_tmem_control(xc,i,TMEMC_SAVE_GET_POOL_FLAGS,dom,0,0,0,NULL);
+        if ( flags != -1 )
+        {
+            pool_id = i;
+            n_pages = xc_tmem_control(xc,i,TMEMC_SAVE_GET_POOL_NPAGES,dom,0,0,0,NULL);
+            if ( !(flags & TMEM_POOL_PERSIST) )
+                n_pages = 0;
+            (void)xc_tmem_control(xc,i,TMEMC_SAVE_GET_POOL_UUID,dom,sizeof(uuid),0,0,&uuid);
+            if ( write_exact(io_fd, &pool_id, sizeof(pool_id)) )
+                return -1;
+            if ( write_exact(io_fd, &flags, sizeof(flags)) )
+                return -1;
+            if ( write_exact(io_fd, &n_pages, sizeof(n_pages)) )
+                return -1;
+            if ( write_exact(io_fd, &uuid, sizeof(uuid)) )
+                return -1;
+            if ( n_pages == 0 )
+                continue;
+
+            pagesize = 1 << (((flags >> TMEM_POOL_PAGESIZE_SHIFT) &
+                              TMEM_POOL_PAGESIZE_MASK) + 12);
+            if ( pagesize > bufsize )
+            {
+                bufsize = pagesize + sizeof(struct tmem_handle);
+                if ( (buf = realloc(buf,bufsize)) == NULL )
+                    return -1;
+            }
+            for ( j = n_pages; j > 0; j-- )
+            {
+                int ret;
+                if ( (ret = xc_tmem_control(xc, pool_id,
+                                            TMEMC_SAVE_GET_NEXT_PAGE, dom,
+                                            bufsize, 0, 0, buf)) > 0 )
+                {
+                    h = (struct tmem_handle *)buf;
+                    if ( write_exact(io_fd, &h->oid, sizeof(h->oid)) )
+                        return -1;
+                    if ( write_exact(io_fd, &h->index, sizeof(h->index)) )
+                        return -1;
+                    h++;
+                    checksum += *(char *)h;
+                    if ( write_exact(io_fd, h, pagesize) )
+                        return -1;
+                } else if ( ret == 0 ) {
+                    continue;
+                } else {
+                    /* page list terminator */
+                    h = (struct tmem_handle *)buf;
+                    h->oid = -1;
+                    if ( write_exact(io_fd, &h->oid, sizeof(h->oid)) )
+                        return -1;
+                    break;
+                }
+            }
+            DPRINTF("saved %d tmem pages for dom=%d pool=%d, checksum=%x\n",
+                         n_pages-j,dom,pool_id,checksum);
+        }
+    }
+    /* pool list terminator */
+    minusone = -1;
+    if ( write_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+
+    return 1;
+}
+
+/* only called for live migration */
+int xc_tmem_save_extra(int xc, int dom, int io_fd, int field_marker)
+{
+    struct tmem_handle handle;
+    int marker = field_marker;
+    uint32_t minusone;
+    int count = 0, checksum = 0;
+
+    if ( write_exact(io_fd, &marker, sizeof(marker)) )
+        return -1;
+    while ( xc_tmem_control(xc, 0, TMEMC_SAVE_GET_NEXT_INV, dom,
+                            sizeof(handle),0,0,&handle) > 0 ) {
+        if ( write_exact(io_fd, &handle.pool_id, sizeof(handle.pool_id)) )
+            return -1;
+        if ( write_exact(io_fd, &handle.oid, sizeof(handle.oid)) )
+            return -1;
+        if ( write_exact(io_fd, &handle.index, sizeof(handle.index)) )
+            return -1;
+        count++;
+        checksum += handle.pool_id + handle.oid + handle.index;
+    }
+    if ( count )
+            DPRINTF("needed %d tmem invalidates, check=%d\n",count,checksum);
+    minusone = -1;
+    if ( write_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+    return 0;
+}
+
+/* only called for live migration */
+void xc_tmem_save_done(int xc, int dom)
+{
+    xc_tmem_control(xc,0,TMEMC_SAVE_END,dom,0,0,0,NULL);
+}
+
+/* restore routines */
+
+static int xc_tmem_restore_new_pool(
+                    int xc,
+                    int cli_id,
+                    uint32_t pool_id,
+                    uint32_t flags,
+                    uint64_t uuid_lo,
+                    uint64_t uuid_hi)
+{
+    tmem_op_t op;
+
+    op.cmd = TMEM_RESTORE_NEW;
+    op.pool_id = pool_id;
+    op.u.new.arg1 = cli_id;
+    op.u.new.flags = flags;
+    op.u.new.uuid[0] = uuid_lo;
+    op.u.new.uuid[1] = uuid_hi;
+
+    return do_tmem_op(xc, &op);
+}
+
+int xc_tmem_restore(int xc, int dom, int io_fd)
+{
+    uint32_t save_max_pools, save_version;
+    uint32_t this_max_pools, this_version;
+    uint32_t pool_id;
+    uint32_t minusone;
+    uint32_t weight, cap, flags;
+    int checksum = 0;
+
+    save_version = xc_tmem_control(xc,0,TMEMC_SAVE_GET_VERSION,dom,0,0,0,NULL);
+    if ( save_version == -1 )
+        return -1; /* domain doesn't exist */
+    save_max_pools = xc_tmem_control(xc,0,TMEMC_SAVE_GET_MAXPOOLS,0,0,0,0,NULL);
+    if ( read_exact(io_fd, &this_version, sizeof(this_version)) )
+        return -1;
+    if ( read_exact(io_fd, &this_max_pools, sizeof(this_max_pools)) )
+        return -1;
+    /* FIXME check here to ensure no version mismatch or maxpools mismatch */
+    if ( read_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+    if ( minusone != -1 )
+        return -1;
+    if ( xc_tmem_control(xc,0,TMEMC_RESTORE_BEGIN,dom,0,0,0,NULL) < 0 )
+        return -1;
+    if ( read_exact(io_fd, &flags, sizeof(flags)) )
+        return -1;
+    if ( flags & TMEM_CLIENT_COMPRESS )
+        if ( xc_tmem_control(xc,0,TMEMC_SET_COMPRESS,dom,1,0,0,NULL) < 0 )
+            return -1;
+    if ( flags & TMEM_CLIENT_FROZEN )
+        if ( xc_tmem_control(xc,0,TMEMC_FREEZE,dom,0,0,0,NULL) < 0 )
+            return -1;
+    if ( read_exact(io_fd, &weight, sizeof(weight)) )
+        return -1;
+    if ( xc_tmem_control(xc,0,TMEMC_SET_WEIGHT,dom,0,0,0,NULL) < 0 )
+        return -1;
+    if ( read_exact(io_fd, &cap, sizeof(cap)) )
+        return -1;
+    if ( xc_tmem_control(xc,0,TMEMC_SET_CAP,dom,0,0,0,NULL) < 0 )
+        return -1;
+    if ( read_exact(io_fd, &minusone, sizeof(minusone)) )
+        return -1;
+    while ( read_exact(io_fd, &pool_id, sizeof(pool_id)) == 0 && pool_id != -1 )
+    {
+        uint64_t uuid[2];
+        uint32_t n_pages;
+        char *buf = NULL;
+        int bufsize = 0, pagesize;
+        int j;
+
+        if ( read_exact(io_fd, &flags, sizeof(flags)) )
+            return -1;
+        if ( read_exact(io_fd, &n_pages, sizeof(n_pages)) )
+            return -1;
+        if ( read_exact(io_fd, &uuid, sizeof(uuid)) )
+            return -1;
+        if ( xc_tmem_restore_new_pool(xc, dom, pool_id,
+                                 flags, uuid[0], uuid[1]) < 0)
+            return -1;
+        if ( n_pages <= 0 )
+            continue;
+
+        pagesize = 1 << (((flags >> TMEM_POOL_PAGESIZE_SHIFT) &
+                              TMEM_POOL_PAGESIZE_MASK) + 12);
+        if ( pagesize > bufsize )
+        {
+            bufsize = pagesize;
+            if ( (buf = realloc(buf,bufsize)) == NULL )
+                return -1;
+        }
+        for ( j = n_pages; j > 0; j-- )
+        {
+            uint64_t oid;
+            uint32_t index;
+            int rc;
+            if ( read_exact(io_fd, &oid, sizeof(oid)) )
+                return -1;
+            if ( oid == -1 )
+                break;
+            if ( read_exact(io_fd, &index, sizeof(index)) )
+                return -1;
+            if ( read_exact(io_fd, buf, pagesize) )
+                return -1;
+            checksum += *buf;
+            if ( (rc = xc_tmem_control(xc, pool_id, TMEMC_RESTORE_PUT_PAGE,
+                                 dom, bufsize, index, oid, buf)) <= 0 )
+            {
+                DPRINTF("xc_tmem_restore: putting page failed, rc=%d\n",rc);
+                return -1;
+            }
+        }
+        if ( n_pages )
+            DPRINTF("restored %d tmem pages for dom=%d pool=%d, check=%x\n",
+                    n_pages-j,dom,pool_id,checksum);
+    }
+    if ( pool_id != -1 )
+        return -1;
+
+    return 0;
+}
+
+/* only called for live migration, must be called after suspend */
+int xc_tmem_restore_extra(int xc, int dom, int io_fd)
+{
+    uint32_t pool_id;
+    uint64_t oid;
+    uint32_t index;
+    int count = 0;
+    int checksum = 0;
+
+    while ( read_exact(io_fd, &pool_id, sizeof(pool_id)) == 0 && pool_id != -1 )
+    {
+        if ( read_exact(io_fd, &oid, sizeof(oid)) )
+            return -1;
+        if ( read_exact(io_fd, &index, sizeof(index)) )
+            return -1;
+        if ( xc_tmem_control(xc, pool_id, TMEMC_RESTORE_FLUSH_PAGE, dom,
+                             0,index,oid,NULL) <= 0 )
+            return -1;
+        count++;
+        checksum += pool_id + oid + index;
+    }
+    if ( pool_id != -1 )
+        return -1;
+    if ( count )
+            DPRINTF("invalidated %d tmem pages, check=%d\n",count,checksum);
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r 5333e6497af6 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/libxc/xenctrl.h     Wed Aug 05 11:17:18 2009 -0600
@@ -1276,12 +1276,13 @@ int xc_set_cpuidle_max_cstate(int xc_han
 /**
  * tmem operations
  */
-int xc_tmem_control(int xc,
-                    int32_t pool_id,
-                    uint32_t subop,
-                    uint32_t cli_id,
-                    uint32_t arg1,
-                    uint32_t arg2,
-                    void *buf);
+int xc_tmem_control(int xc, int32_t pool_id, uint32_t subop, uint32_t cli_id,
+                    uint32_t arg1, uint32_t arg2, uint64_t arg3, void *buf);
+int xc_tmem_auth(int xc_handle, int cli_id, char *uuid_str, int arg1);
+int xc_tmem_save(int xc_handle, int dom, int live, int fd, int field_marker);
+int xc_tmem_save_extra(int xc_handle, int dom, int fd, int field_marker);
+void xc_tmem_save_done(int xc_handle, int dom);
+int xc_tmem_restore(int xc_handle, int dom, int fd);
+int xc_tmem_restore_extra(int xc_handle, int dom, int fd);
 
 #endif /* XENCTRL_H */
diff -r 5333e6497af6 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Aug 05 11:17:18 2009 -0600
@@ -1523,20 +1523,21 @@ static PyObject *pyxc_tmem_control(XcObj
     uint32_t cli_id;
     uint32_t arg1;
     uint32_t arg2;
+    uint64_t arg3;
     char *buf;
     char _buffer[32768], *buffer = _buffer;
     int rc;
 
-    static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "buf", NULL };
+    static char *kwd_list[] = { "pool_id", "subop", "cli_id", "arg1", "arg2", "arg3", "buf", NULL };
 
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiis", kwd_list,
-                                      &pool_id, &subop, &cli_id, &arg1, &arg2, &buf) )
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiiiis", kwd_list,
+                        &pool_id, &subop, &cli_id, &arg1, &arg2, &arg3, &buf) )
         return NULL;
 
     if ( (subop == TMEMC_LIST) && (arg1 > 32768) )
         arg1 = 32768;
 
-    if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, buffer)) < 0 )
+    if ( (rc = xc_tmem_control(self->xc_handle, pool_id, subop, cli_id, arg1, arg2, arg3, buffer)) < 0 )
         return Py_BuildValue("i", rc);
 
     switch (subop) {
@@ -1553,6 +1554,28 @@ static PyObject *pyxc_tmem_control(XcObj
         default:
             break;
     }
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_tmem_shared_auth(XcObject *self,
+                                   PyObject *args,
+                                   PyObject *kwds)
+{
+    uint32_t cli_id;
+    uint32_t arg1;
+    char *uuid_str;
+    int rc;
+
+    static char *kwd_list[] = { "cli_id", "uuid_str", "arg1" };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "isi", kwd_list,
+                                   &cli_id, &uuid_str, &arg1) )
+        return NULL;
+
+    if ( (rc = xc_tmem_auth(self->xc_handle, cli_id, uuid_str, arg1)) < 0 )
+        return Py_BuildValue("i", rc);
 
     Py_INCREF(zero);
     return zero;
@@ -2029,6 +2052,15 @@ static PyMethodDef pyxc_methods[] = {
       " buf [str]: Buffer.\n\n"
      "Returns: [int] 0 or [str] tmem info on success; exception on error.\n" },
 
+    { "tmem_shared_auth",
+      (PyCFunction)pyxc_tmem_shared_auth,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "De/authenticate a shared tmem pool.\n"
+      " cli_id [int]: Client identifier (-1 == all).\n"
+      " uuid_str [str]: uuid.\n"
+      " auth [int]: 0|1 .\n"
+      "Returns: [int] 0 on success; exception on error.\n" },
+
     { NULL, NULL, 0, NULL }
 };
 
diff -r 5333e6497af6 tools/python/xen/xend/XendAPI.py
--- a/tools/python/xen/xend/XendAPI.py  Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/xend/XendAPI.py  Wed Aug 05 11:17:18 2009 -0600
@@ -933,7 +933,8 @@ class XendAPI(object):
                     ('tmem_list', None),
                     ('tmem_set_weight', None),
                     ('tmem_set_cap', None),
-                    ('tmem_set_compress', None)]
+                    ('tmem_set_compress', None),
+                    ('tmem_shared_auth', None)]
     
     host_funcs = [('get_by_name_label', None),
                   ('list_methods', None)]
@@ -1129,6 +1130,14 @@ class XendAPI(object):
         node = XendNode.instance()
         try:
             node.tmem_set_compress(cli_id, value)
+        except Exception, e:
+            return xen_api_error(e)
+        return xen_api_success_void()
+
+    def host_tmem_shared_auth(self, _, host_ref, cli_id, uuid_str, auth):
+        node = XendNode.instance()
+        try:
+            node.tmem_shared_auth(cli_id, uuid_str, auth)
         except Exception, e:
             return xen_api_error(e)
         return xen_api_success_void()
diff -r 5333e6497af6 tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/xend/XendNode.py Wed Aug 05 11:17:18 2009 -0600
@@ -949,62 +949,72 @@ class XendNode:
         subop = TMEMC_LIST
         arg1 = 32768
         arg2 = use_long
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_thaw(self, cli_id):
         pool_id = -1
         subop = TMEMC_THAW
         arg1 = 0
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_freeze(self, cli_id):
         pool_id = -1
         subop = TMEMC_FREEZE
         arg1 = 0
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_flush(self, cli_id, pages):
         pool_id = -1
         subop = TMEMC_FLUSH
         arg1 = pages
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_destroy(self, cli_id):
         pool_id = -1
         subop = TMEMC_DESTROY
         arg1 = 0
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_set_weight(self, cli_id, arg1):
         pool_id = -1
         subop = TMEMC_SET_WEIGHT
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_set_cap(self, cli_id, arg1):
         pool_id = -1
         subop = TMEMC_SET_CAP
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
     def tmem_set_compress(self, cli_id, arg1):
         pool_id = -1
         subop = TMEMC_SET_COMPRESS
         arg2 = 0
+        arg3 = 0
         buf = ''
-        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, buf)
+        return self.xc.tmem_control(pool_id, subop, cli_id, arg1, arg2, arg3, buf)
 
+    def tmem_shared_auth(self, cli_id, uuid_str, auth):
+        return self.xc.tmem_auth(cli_id, uuid_str, auth)
 
 def instance():
     global inst
diff -r 5333e6497af6 tools/python/xen/xend/balloon.py
--- a/tools/python/xen/xend/balloon.py  Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/xend/balloon.py  Wed Aug 05 11:17:18 2009 -0600
@@ -111,7 +111,7 @@ def free(need_mem, dominfo):
         rlimit = RETRY_LIMIT
 
         # stop tmem from absorbing any more memory (must THAW when done!)
-        xc.tmem_control(0,TMEMC_FREEZE,-1, 0, 0, "")
+        xc.tmem_control(0,TMEMC_FREEZE,-1, 0, 0, 0, "")
 
         # If unreasonable memory size is required, we give up waiting
         # for ballooning or scrubbing, as if had retried.
@@ -130,7 +130,7 @@ def free(need_mem, dominfo):
         if freeable_mem < need_mem and need_mem < max_free_mem:
             # flush memory from tmem to scrub_mem and reobtain physinfo
             need_tmem_kb = need_mem - freeable_mem
-            tmem_kb = xc.tmem_control(0,TMEMC_FLUSH,-1, need_tmem_kb, 0, "")
+            tmem_kb = xc.tmem_control(0,TMEMC_FLUSH,-1, need_tmem_kb, 0, 0, "")
             log.debug("Balloon: tmem relinquished %d KiB of %d KiB requested.",
                       tmem_kb, need_tmem_kb)
             physinfo = xc.physinfo()
@@ -232,5 +232,5 @@ def free(need_mem, dominfo):
 
     finally:
         # allow tmem to accept pages again
-        xc.tmem_control(0,TMEMC_THAW,-1, 0, 0, "")
+        xc.tmem_control(0,TMEMC_THAW,-1, 0, 0, 0, "")
         del xc
diff -r 5333e6497af6 tools/python/xen/xend/server/XMLRPCServer.py
--- a/tools/python/xen/xend/server/XMLRPCServer.py      Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/xend/server/XMLRPCServer.py      Wed Aug 05 11:17:18 2009 -0600
@@ -202,7 +202,8 @@ class XMLRPCServer:
                               ['info', 'pciinfo', 'send_debug_keys',
                                'tmem_list', 'tmem_freeze', 'tmem_thaw',
                                'tmem_flush', 'tmem_destroy', 'tmem_set_weight',
-                               'tmem_set_cap', 'tmem_set_compress'],
+                               'tmem_set_cap', 'tmem_set_compress',
+                               'tmem_shared_auth'],
                              'node'),
                              (XendDmesg, ['info', 'clear'], 'node.dmesg')]:
             inst = type.instance()
diff -r 5333e6497af6 tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Mon Jul 20 15:45:50 2009 +0100
+++ b/tools/python/xen/xm/main.py       Wed Aug 05 11:17:18 2009 -0600
@@ -207,6 +207,7 @@ SUBCOMMAND_HELP = {
     'tmem-set'      :  ('[<Domain>|-a|--all] [weight=<weight>] [cap=<cap>] '
                         '[compress=<compress>]',
                         'Change tmem settings.'),
+    'tmem-shared-auth' :  ('[<Domain>|-a|--all] [--uuid=<uuid>] [--auth=<0|1>]', 'De/authenticate shared tmem pool.'),
 
     # security
 
@@ -306,6 +307,11 @@ SUBCOMMAND_OPTIONS = {
     ),
     'tmem-set':  (
        ('-a', '--all', 'Operate on all tmem.'),
+    ),
+    'tmem-shared-auth':  (
+       ('-a', '--all', 'Authenticate for all tmem pools.'),
+       ('-u', '--uuid', 'Specify uuid (abcdef01-2345-6789-01234567890abcdef).'),
+       ('-A', '--auth', '0=auth,1=deauth'),
     ),
 }
 
@@ -427,6 +433,7 @@ tmem_commands = [
     "tmem-freeze",
     "tmem-destroy",
     "tmem-set",
+    "tmem-shared-auth",
     ]
 
 all_commands = (domain_commands + host_commands + scheduler_commands +
@@ -3128,6 +3135,46 @@ def xm_tmem_set(args):
         if compress is not None:
             server.xend.node.tmem_set_compress(domid, compress)
 
+def xm_tmem_shared_auth(args):
+    try:
+        (options, params) = getopt.gnu_getopt(args, 'au:A:', ['all','uuid=','auth='])
+    except getopt.GetoptError, opterr:
+        err(opterr)
+       usage('tmem-shared-auth')
+
+    all = False
+    for (k, v) in options:
+        if k in ['-a', '--all']:
+            all = True
+
+    if not all and len(params) == 0:
+        err('You must specify -a or --all or a domain id.')
+        usage('tmem-shared-auth')
+
+    if all:
+        domid = -1
+    else:
+        try: 
+            domid = int(params[0])
+            params = params[1:]
+        except:
+            err('Unrecognized domain id: %s' % params[0])
+            usage('tmem-shared-auth')
+
+    for (k, v) in options:
+        if k in ['-u', '--uuid']:
+             uuid_str = v
+
+    auth = 0
+    for (k, v) in options:
+        if k in ['-A', '--auth']:
+            auth = v
+
+    if serverType == SERVER_XEN_API:
+        return server.xenapi.host.tmem_shared_auth(domid,uuid_str,auth)
+    else:
+        return server.xend.node.tmem_shared_auth(domid,uuid_str,auth)
+
 
 commands = {
     "shell": xm_shell,
@@ -3210,6 +3257,7 @@ commands = {
     "tmem-destroy": xm_tmem_destroy,
     "tmem-list": xm_tmem_list,
     "tmem-set": xm_tmem_set,
+    "tmem-shared-auth": xm_tmem_shared_auth,
     }
 
 ## The commands supported by a separate argument parser in xend.xm.
diff -r 5333e6497af6 xen/common/tmem.c
--- a/xen/common/tmem.c Mon Jul 20 15:45:50 2009 +0100
+++ b/xen/common/tmem.c Wed Aug 05 11:17:18 2009 -0600
@@ -26,6 +26,8 @@
 
 #define EXPORT /* indicates code other modules are dependent upon */
 #define FORWARD
+
+#define TMEM_SPEC_VERSION 0
 
 /************  INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
 
@@ -105,6 +107,7 @@ DECL_CYC_COUNTER(decompress);
 #define MAX_GLOBAL_SHARED_POOLS  16
 
 struct tm_pool;
+struct tmem_page_descriptor;
 struct client {
     struct list_head client_list;
     struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -116,11 +119,20 @@ struct client {
     uint32_t cap;
     bool_t compress;
     bool_t frozen;
+    bool_t shared_auth_required;
+    /* for save/restore/migration */
+    bool_t live_migrating;
+    bool_t was_frozen;
+    struct list_head persistent_invalidated_list;
+    struct tmem_page_descriptor *cur_pgp;
+    /* statistics collection */
     unsigned long compress_poor, compress_nomem;
     unsigned long compressed_pages;
     uint64_t compressed_sum_size;
     uint64_t total_cycles;
     unsigned long succ_pers_puts, succ_eph_gets, succ_pers_gets;
+    /* shared pool authentication */
+    uint64_t shared_auth_uuid[MAX_GLOBAL_SHARED_POOLS][2];
 };
 typedef struct client client_t;
 
@@ -137,6 +149,7 @@ struct tm_pool {
 struct tm_pool {
     bool_t shared;
     bool_t persistent;
+    int pageshift; /* 0 == 2**12 */
     struct list_head pool_list; /* FIXME do we need this anymore? */
     client_t *client;
     uint64_t uuid[2]; /* 0 for private, non-zero for shared */
@@ -144,8 +157,11 @@ struct tm_pool {
     rwlock_t pool_rwlock;
     struct rb_root obj_rb_root[OBJ_HASH_BUCKETS]; /* protected by pool_rwlock */
     struct list_head share_list; /* valid if shared */
-    DECL_SENTINEL
     int shared_count; /* valid if shared */
+    /* for save/restore/migration */
+    struct list_head persistent_page_list;
+    struct tmem_page_descriptor *cur_pgp;
+    /* statistics collection */
     atomic_t pgp_count;
     int pgp_count_max;
     long obj_count;  /* atomicity depends on pool_rwlock held for write */
@@ -158,6 +174,7 @@ struct tm_pool {
     unsigned long gets, found_gets;
     unsigned long flushs, flushs_found;
     unsigned long flush_objs, flush_objs_found;
+    DECL_SENTINEL
 };
 typedef struct tm_pool pool_t;
 
@@ -189,16 +206,29 @@ typedef struct tmem_object_node objnode_
 typedef struct tmem_object_node objnode_t;
 
 struct tmem_page_descriptor {
-    struct list_head global_eph_pages;
-    struct list_head client_eph_pages;
-    obj_t *obj;
+    union {
+        struct list_head global_eph_pages;
+        struct list_head client_inv_pages;
+    };
+    union {
+        struct list_head client_eph_pages;
+        struct list_head pool_pers_pages;
+    };
+    union {
+        obj_t *obj;
+        uint64_t inv_oid;  /* used for invalid list only */
+    };
     uint32_t index;
-    size_t size; /* 0 == PAGE_SIZE (pfp), else compressed data (cdata) */
+    size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
+                    else compressed data (cdata) */
     union {
         pfp_t *pfp;  /* page frame pointer */
         char *cdata; /* compressed data */
     };
-    uint64_t timestamp;
+    union {
+        uint64_t timestamp;
+        uint32_t pool_id;  /* used for invalid list only */
+    };
     DECL_SENTINEL
 };
 typedef struct tmem_page_descriptor pgp_t;
@@ -209,6 +239,7 @@ static LIST_HEAD(global_pool_list);
 static LIST_HEAD(global_pool_list);
 
 static pool_t *global_shared_pools[MAX_GLOBAL_SHARED_POOLS] = { 0 };
+static bool_t global_shared_auth = 0;
 static atomic_t client_weight_total = ATOMIC_INIT(0);
 static int tmem_initialized = 0;
 
@@ -217,6 +248,7 @@ EXPORT DEFINE_SPINLOCK(tmem_spinlock);  
 EXPORT DEFINE_SPINLOCK(tmem_spinlock);  /* used iff tmh_lock_all */
 EXPORT DEFINE_RWLOCK(tmem_rwlock);      /* used iff !tmh_lock_all */
 static DEFINE_SPINLOCK(eph_lists_spinlock); /* protects global AND clients */
+static DEFINE_SPINLOCK(pers_lists_spinlock);
 
 #define tmem_spin_lock(_l)  do {if (!tmh_lock_all) spin_lock(_l);}while(0)
 #define tmem_spin_unlock(_l)  do {if (!tmh_lock_all) spin_unlock(_l);}while(0)
@@ -366,36 +398,61 @@ static NOINLINE void pgp_free(pgp_t *pgp
     ASSERT(pgp->obj != NULL);
     ASSERT_SENTINEL(pgp->obj,OBJ);
     ASSERT_SENTINEL(pgp->obj->pool,POOL);
-    ASSERT(list_empty(&pgp->global_eph_pages));
-    ASSERT(list_empty(&pgp->client_eph_pages));
+    ASSERT(pgp->obj->pool->client != NULL);
     if ( from_delete )
         ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
     ASSERT(pgp->obj->pool != NULL);
     pool = pgp->obj->pool;
+    if ( is_ephemeral(pool) )
+    {
+        ASSERT(list_empty(&pgp->global_eph_pages));
+        ASSERT(list_empty(&pgp->client_eph_pages));
+    }
     pgp_free_data(pgp, pool);
+    atomic_dec_and_assert(global_pgp_count);
+    atomic_dec_and_assert(pool->pgp_count);
+    pgp->size = -1;
+    if ( is_persistent(pool) && pool->client->live_migrating )
+    {
+        pgp->inv_oid = pgp->obj->oid;
+        pgp->pool_id = pool->pool_id;
+        return;
+    }
     INVERT_SENTINEL(pgp,PGD);
     pgp->obj = NULL;
     pgp->index = -1;
-    pgp->size = -1;
-    atomic_dec_and_assert(global_pgp_count);
-    atomic_dec_and_assert(pool->pgp_count);
+    tmem_free(pgp,sizeof(pgp_t),pool);
+}
+
+/* Free a pgp that sits on a client's persistent_invalidated_list.
+ * Such a pgp no longer has a valid obj pointer: per the unions in
+ * struct tmem_page_descriptor, the obj slot holds inv_oid and the
+ * timestamp slot holds pool_id, so the owning pool must be looked up
+ * via pgp->pool_id rather than via pgp->obj->pool. */
+static NOINLINE void pgp_free_from_inv_list(client_t *client, pgp_t *pgp)
+{
+    pool_t *pool = client->pools[pgp->pool_id];
+
+    ASSERT_SENTINEL(pool,POOL);
+    ASSERT_SENTINEL(pgp,PGD);
+    INVERT_SENTINEL(pgp,PGD);
+    pgp->obj = NULL;
+    pgp->index = -1;
+    tmem_free(pgp,sizeof(pgp_t),pool);
+}
 
 /* remove the page from appropriate lists but not from parent object */
 static void pgp_delist(pgp_t *pgp, bool_t no_eph_lock)
 {
+    client_t *client;
+
     ASSERT(pgp != NULL);
     ASSERT(pgp->obj != NULL);
     ASSERT(pgp->obj->pool != NULL);
-    ASSERT(pgp->obj->pool->client != NULL);
+    client = pgp->obj->pool->client;
+    ASSERT(client != NULL);
     if ( is_ephemeral(pgp->obj->pool) )
     {
         if ( !no_eph_lock )
             tmem_spin_lock(&eph_lists_spinlock);
         if ( !list_empty(&pgp->client_eph_pages) )
-            pgp->obj->pool->client->eph_count--;
-        ASSERT(pgp->obj->pool->client->eph_count >= 0);
+            client->eph_count--;
+        ASSERT(client->eph_count >= 0);
         list_del_init(&pgp->client_eph_pages);
         if ( !list_empty(&pgp->global_eph_pages) )
             global_eph_count--;
@@ -403,6 +460,20 @@ static void pgp_delist(pgp_t *pgp, bool_
         list_del_init(&pgp->global_eph_pages);
         if ( !no_eph_lock )
             tmem_spin_unlock(&eph_lists_spinlock);
+    } else {
+        if ( client->live_migrating )
+        {
+            tmem_spin_lock(&pers_lists_spinlock);
+            list_add_tail(&pgp->client_inv_pages,
+                          &client->persistent_invalidated_list);
+            if ( pgp != pgp->obj->pool->cur_pgp )
+                list_del_init(&pgp->pool_pers_pages);
+            tmem_spin_unlock(&pers_lists_spinlock);
+        } else {
+            tmem_spin_lock(&pers_lists_spinlock);
+            list_del_init(&pgp->pool_pers_pages);
+            tmem_spin_unlock(&pers_lists_spinlock);
+        }
     }
 }
 
@@ -564,6 +635,7 @@ static NOINLINE void obj_free(obj_t *obj
     ASSERT(obj->pgp_count == 0);
     pool = obj->pool;
     ASSERT(pool != NULL);
+    ASSERT(pool->client != NULL);
     ASSERT_WRITELOCK(&pool->pool_rwlock);
     if ( obj->tree_root.rnode != NULL ) /* may be a "stump" with no leaves */
         radix_tree_destroy(&obj->tree_root, pgp_destroy, rtn_free);
@@ -685,11 +757,14 @@ static pool_t * pool_alloc(void)
     for (i = 0; i < OBJ_HASH_BUCKETS; i++)
         pool->obj_rb_root[i] = RB_ROOT;
     INIT_LIST_HEAD(&pool->pool_list);
+    INIT_LIST_HEAD(&pool->persistent_page_list);
+    pool->cur_pgp = NULL;
     rwlock_init(&pool->pool_rwlock);
     pool->pgp_count_max = pool->obj_count_max = 0;
     pool->objnode_count = pool->objnode_count_max = 0;
     atomic_set(&pool->pgp_count,0);
-    pool->obj_count = 0;
+    pool->obj_count = 0; pool->shared_count = 0;
+    pool->pageshift = PAGE_SHIFT - 12;
     pool->good_puts = pool->puts = pool->dup_puts_flushed = 0;
     pool->dup_puts_replaced = pool->no_mem_puts = 0;
     pool->found_gets = pool->gets = 0;
@@ -805,6 +880,12 @@ static void pool_flush(pool_t *pool, cli
         is_persistent(pool) ? "persistent" : "ephemeral" ,
         is_shared(pool) ? "shared" : "private");
     printk("%s=%d pool_id=%d\n", 
cli_id_str,pool->client->cli_id,pool->pool_id);
+    if ( pool->client->live_migrating )
+    {
+        printk("can't %s pool while %s is live-migrating\n",
+               destroy?"destroy":"flush", client_str);
+        return;
+    }
     pool_destroy_objs(pool,0,CLI_ID_NULL);
     if ( destroy )
     {
@@ -815,10 +896,10 @@ static void pool_flush(pool_t *pool, cli
 
 /************ CLIENT MANIPULATION OPERATIONS **************************/
 
-static client_t *client_create(void)
+static client_t *client_create(cli_id_t cli_id)
 {
     client_t *client = tmem_malloc(client_t,NULL);
-    cli_id_t cli_id = tmh_get_cli_id_from_current();
+    int i;
 
     printk("tmem: initializing tmem capability for 
%s=%d...",cli_id_str,cli_id);
     if ( client == NULL )
@@ -834,15 +915,23 @@ static client_t *client_create(void)
             tmem_free(client,sizeof(client_t),NULL);
         return NULL;
     }
-    tmh_set_current_client(client);
+    tmh_set_client_from_id(client,cli_id);
     client->cli_id = cli_id;
 #ifdef __i386__
     client->compress = 0;
 #else
     client->compress = tmh_compression_enabled();
 #endif
+    client->shared_auth_required = tmh_shared_auth();
+    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
+        client->shared_auth_uuid[i][0] =
+            client->shared_auth_uuid[i][1] = -1L;
+    client->frozen = 0; client->live_migrating = 0;
+    client->weight = 0; client->cap = 0;
     list_add_tail(&client->client_list, &global_client_list);
     INIT_LIST_HEAD(&client->ephemeral_page_list);
+    INIT_LIST_HEAD(&client->persistent_invalidated_list);
+    client->cur_pgp = NULL;
     client->eph_count = client->eph_count_max = 0;
     client->total_cycles = 0; client->succ_pers_puts = 0;
     client->succ_eph_gets = 0; client->succ_pers_gets = 0;
@@ -885,6 +974,11 @@ static bool_t client_over_quota(client_t
         return 0;
     return ( ((global_eph_count*100L) / client->eph_count ) >
              ((total*100L) / client->weight) );
+}
+
+/* Set/clear the client's frozen flag; a frozen client's pools reject
+ * new (non-duplicate) puts. */
+static void client_freeze(client_t *client, int freeze)
+{
+    client->frozen = freeze;
+}
 
 /************ MEMORY REVOCATION ROUTINES *******************************/
@@ -993,7 +1087,8 @@ static unsigned long tmem_relinquish_npa
 
 /************ TMEM CORE OPERATIONS ************************************/
 
-static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn)
+static NOINLINE int do_tmem_put_compress(pgp_t *pgp, tmem_cli_mfn_t cmfn,
+                                         void *cva)
 {
     void *dst, *p;
     size_t size;
@@ -1011,7 +1106,7 @@ static NOINLINE int do_tmem_put_compress
     if ( pgp->pfp != NULL )
         pgp_free_data(pgp, pgp->obj->pool);  /* FIXME... is this right? */
     START_CYC_COUNTER(compress);
-    ret = tmh_compress_from_client(cmfn, &dst, &size);
+    ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
     if ( (ret == -EFAULT) || (ret == 0) )
         goto out;
     else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
@@ -1034,7 +1129,7 @@ out:
 }
 
 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
-              uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+       uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
 {
     pool_t *pool;
     obj_t *obj;
@@ -1042,7 +1137,6 @@ static NOINLINE int do_tmem_dup_put(pgp_
     pgp_t *pgpfound = NULL;
     int ret;
 
-    /* if we can successfully manipulate pgp to change out the data, do so */
     ASSERT(pgp != NULL);
     ASSERT(pgp->pfp != NULL);
     ASSERT(pgp->size != -1);
@@ -1052,10 +1146,12 @@ static NOINLINE int do_tmem_dup_put(pgp_
     pool = obj->pool;
     ASSERT(pool != NULL);
     client = pool->client;
-    if ( len != 0 && tmh_compression_enabled() &&
-         client->compress && pgp->size != 0 )
+    if ( client->live_migrating )
+        goto failed_dup; /* no dups allowed when migrating */
+    /* can we successfully manipulate pgp to change out the data? */
+    if ( len != 0 && client->compress && pgp->size != 0 )
     {
-        ret = do_tmem_put_compress(pgp,cmfn);
+        ret = do_tmem_put_compress(pgp,cmfn,cva);
         if ( ret == 1 )
             goto done;
         else if ( ret == 0 )
@@ -1072,7 +1168,7 @@ copy_uncompressed:
     if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
         goto failed_dup;
     /* tmh_copy_from_client properly handles len==0 and offsets != 0 */
-    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
     if ( ret == -EFAULT )
         goto bad_copy;
     pgp->size = 0;
@@ -1115,9 +1211,10 @@ failed_dup:
 }
 
 
-static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index,
+static NOINLINE int do_tmem_put(pool_t *pool,
+              uint64_t oid, uint32_t index,
               tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-              uint32_t pfn_offset, uint32_t len)
+              uint32_t pfn_offset, uint32_t len, void *cva)
 {
     obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
     pgp_t *pgp = NULL, *pgpdel = NULL;
@@ -1131,7 +1228,7 @@ static NOINLINE int do_tmem_put(pool_t *
     {
         ASSERT_SPINLOCK(&objfound->obj_spinlock);
         if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
-            return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len);
+            return do_tmem_dup_put(pgp,cmfn,tmem_offset,pfn_offset,len,cva);
     }
 
     /* no puts allowed into a frozen pool (except dup puts) */
@@ -1162,10 +1259,10 @@ static NOINLINE int do_tmem_put(pool_t *
     ASSERT(ret != -EEXIST);
     pgp->index = index;
 
-    if ( len != 0 && tmh_compression_enabled() && client->compress )
+    if ( len != 0 && client->compress )
     {
         ASSERT(pgp->pfp == NULL);
-        ret = do_tmem_put_compress(pgp,cmfn);
+        ret = do_tmem_put_compress(pgp,cmfn,cva);
         if ( ret == 1 )
             goto insert_page;
         if ( ret == -ENOMEM )
@@ -1189,7 +1286,7 @@ copy_uncompressed:
         goto delete_and_free;
     }
     /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
-    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len);
+    ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
     if ( ret == -EFAULT )
         goto bad_copy;
     pgp->size = 0;
@@ -1207,6 +1304,11 @@ insert_page:
         if (++client->eph_count > client->eph_count_max)
             client->eph_count_max = client->eph_count;
         tmem_spin_unlock(&eph_lists_spinlock);
+    } else { /* is_persistent */
+        tmem_spin_lock(&pers_lists_spinlock);
+        list_add_tail(&pgp->pool_pers_pages,
+            &pool->persistent_page_list);
+        tmem_spin_unlock(&pers_lists_spinlock);
     }
     ASSERT( ((objnew==obj)||(objfound==obj)) && (objnew!=objfound));
     if ( is_shared(pool) )
@@ -1249,7 +1351,7 @@ ASSERT(0);
 
 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
               tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-              uint32_t pfn_offset, uint32_t len)
+              uint32_t pfn_offset, uint32_t len, void *cva)
 {
     obj_t *obj;
     pgp_t *pgp;
@@ -1279,12 +1381,13 @@ static NOINLINE int do_tmem_get(pool_t *
     if ( pgp->size != 0 )
     {
         START_CYC_COUNTER(decompress);
-        if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size) == -EFAULT )
+        if ( tmh_decompress_to_client(cmfn, pgp->cdata,
+                                      pgp->size, cva) == -EFAULT )
             goto bad_copy;
         END_CYC_COUNTER(decompress);
     }
     else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
-                                 pfn_offset, len) == -EFAULT)
+                                 pfn_offset, len, cva) == -EFAULT)
         goto bad_copy;
     if ( is_ephemeral(pool) )
     {
@@ -1398,10 +1501,12 @@ static NOINLINE int do_tmem_destroy_pool
     return 1;
 }
 
-static NOINLINE int do_tmem_new_pool(uint32_t flags, uint64_t uuid_lo, 
uint64_t uuid_hi)
+static NOINLINE int do_tmem_new_pool(cli_id_t this_cli_id,
+                                     uint32_t this_pool_id, uint32_t flags,
+                                     uint64_t uuid_lo, uint64_t uuid_hi)
 {
-    client_t *client = tmh_client_from_current();
-    cli_id_t cli_id = tmh_get_cli_id_from_current();
+    client_t *client;
+    cli_id_t cli_id;
     int persistent = flags & TMEM_POOL_PERSIST;
     int shared = flags & TMEM_POOL_SHARED;
     int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
@@ -1410,12 +1515,22 @@ static NOINLINE int do_tmem_new_pool(uin
          & TMEM_POOL_VERSION_MASK;
     pool_t *pool, *shpool;
     int s_poolid, d_poolid, first_unused_s_poolid;
+    int i;
 
+    if ( this_cli_id == CLI_ID_NULL )
+    {
+        client = tmh_client_from_current();
+        cli_id = tmh_get_cli_id_from_current();
+    } else {
+        if ( (client = tmh_client_from_cli_id(this_cli_id)) == NULL)
+            return -EPERM;
+        cli_id = this_cli_id;
+    }
     ASSERT(client != NULL);
     printk("tmem: allocating %s-%s tmem pool for %s=%d...",
         persistent ? "persistent" : "ephemeral" ,
         shared ? "shared" : "private", cli_id_str, cli_id);
-    if ( specversion != 0 )
+    if ( specversion != TMEM_SPEC_VERSION )
     {
         printk("failed... unsupported spec version\n");
         return -EPERM;
@@ -1430,14 +1545,35 @@ static NOINLINE int do_tmem_new_pool(uin
         printk("failed... out of memory\n");
         return -ENOMEM;
     }
-    for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
+    if ( this_cli_id != CLI_ID_NULL )
+    {
+        /* restore path: the caller names the pool slot explicitly.
+         * fix: validate the index BEFORE indexing client->pools[]
+         * (the >= MAX_POOLS_PER_DOMAIN check below runs only after the
+         * array access), and drop the duplicated assignment.
+         * NOTE(review): returning here leaks the pool allocated above;
+         * consider routing this through the fail: path instead. */
+        d_poolid = this_pool_id;
+        if ( d_poolid >= MAX_POOLS_PER_DOMAIN ||
+             client->pools[d_poolid] != NULL )
+            return -EPERM;
+    }
+    else for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
         if ( client->pools[d_poolid] == NULL )
             break;
-    if ( d_poolid == MAX_POOLS_PER_DOMAIN )
+    if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
     {
         printk("failed... no more pool slots available for this %s\n",
             client_str);
         goto fail;
+    }
+    if ( shared )
+    {
+        if ( uuid_lo == -1L && uuid_hi == -1L )
+            shared = 0;
+        if ( client->shared_auth_required && !global_shared_auth )
+        {
+            for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
+                if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
+                     (client->shared_auth_uuid[i][1] == uuid_hi) )
+                    break;
+            if ( i == MAX_GLOBAL_SHARED_POOLS )
+                shared = 0;
+        }
     }
     pool->shared = shared;
     pool->client = client;
@@ -1491,7 +1627,7 @@ fail:
 /************ TMEM CONTROL OPERATIONS ************************************/
 
 /* freeze/thaw all pools belonging to client cli_id (all domains if -1) */
-static int tmemc_freeze_pools(int cli_id, int arg)
+static int tmemc_freeze_pools(cli_id_t cli_id, int arg)
 {
     client_t *client;
     bool_t freeze = (arg == TMEMC_FREEZE) ? 1 : 0;
@@ -1502,20 +1638,20 @@ static int tmemc_freeze_pools(int cli_id
     if ( cli_id == CLI_ID_NULL )
     {
         list_for_each_entry(client,&global_client_list,client_list)
-            client->frozen = freeze;
+            client_freeze(client,freeze);
         printk("tmem: all pools %s for all %ss\n",s,client_str);
     }
     else
     {
         if ( (client = tmh_client_from_cli_id(cli_id)) == NULL)
             return -1;
-        client->frozen = freeze;
+        client_freeze(client,freeze);
         printk("tmem: all pools %s for %s=%d\n",s,cli_id_str,cli_id);
     }
     return 0;
 }
 
-static int tmemc_flush_mem(int cli_id, uint32_t kb)
+static int tmemc_flush_mem(cli_id_t cli_id, uint32_t kb)
 {
     uint32_t npages, flushed_pages, flushed_kb;
 
@@ -1699,7 +1835,7 @@ static int tmemc_list_global(tmem_cli_va
     return sum;
 }
 
-static int tmemc_list(int cli_id, tmem_cli_va_t buf, uint32_t len,
+static int tmemc_list(cli_id_t cli_id, tmem_cli_va_t buf, uint32_t len,
                                bool_t use_long)
 {
     client_t *client;
@@ -1716,7 +1852,6 @@ static int tmemc_list(int cli_id, tmem_c
         return -1;
     else
         off = tmemc_list_client(client, buf, 0, len, use_long);
-
 
     return 0;
 }
@@ -1740,6 +1875,9 @@ static int tmemc_set_var_one(client_t *c
         printk("tmem: cap set to %d for %s=%d\n",arg1,cli_id_str,cli_id);
         break;
     case TMEMC_SET_COMPRESS:
+#ifdef __i386__
+        return -1;
+#endif
         client->compress = arg1 ? 1 : 0;
         printk("tmem: compression %s for %s=%d\n",
             arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
@@ -1751,7 +1889,7 @@ static int tmemc_set_var_one(client_t *c
     return 0;
 }
 
-static int tmemc_set_var(int cli_id, uint32_t subop, uint32_t arg1)
+static int tmemc_set_var(cli_id_t cli_id, uint32_t subop, uint32_t arg1)
 {
     client_t *client;
 
@@ -1765,11 +1903,229 @@ static int tmemc_set_var(int cli_id, uin
     return 0;
 }
 
-static int do_tmem_control(uint32_t subop, uint32_t cli_id32,
-   uint32_t arg1, uint32_t arg2, tmem_cli_va_t buf)
+/* Authorize (auth=1) or deauthorize (auth=0) the given shared-pool uuid
+ * for client cli_id; cli_id == CLI_ID_NULL toggles the global
+ * "authorize everyone" override instead.
+ * Returns 1 on success, 0 on deauth of an unknown uuid, -EINVAL for an
+ * unknown client, -ENOMEM when the per-client auth table is full. */
+static NOINLINE int tmemc_shared_pool_auth(cli_id_t cli_id, uint64_t uuid_lo,
+                                  uint64_t uuid_hi, bool_t auth)
+{
+    client_t *client;
+    int i, free = -1;
+
+    if ( cli_id == CLI_ID_NULL )
+    {
+        global_shared_auth = auth;
+        return 1;
+    }
+    client = tmh_client_from_cli_id(cli_id);
+    if ( client == NULL )
+        return -EINVAL; /* fix: was dereferenced without a NULL check */
+    for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
+    {
+        if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
+             (client->shared_auth_uuid[i][1] == uuid_hi) )
+        {
+            /* uuid already present: deauth clears the slot (-1L,-1L
+             * marks a free entry, cf. client_create) */
+            if ( auth == 0 )
+                client->shared_auth_uuid[i][0] =
+                    client->shared_auth_uuid[i][1] = -1L;
+            return 1;
+        }
+        /* remember the first free slot in case this is a new auth */
+        if ( (auth == 1) && (client->shared_auth_uuid[i][0] == -1L) &&
+                 (client->shared_auth_uuid[i][1] == -1L) && (free == -1) )
+            free = i;
+    }
+    if ( auth == 0 )
+        return 0; /* deauth of a uuid that was never authorized */
+    if ( auth == 1 && free == -1 )
+        return -ENOMEM; /* auth table full */
+    client->shared_auth_uuid[free][0] = uuid_lo;
+    client->shared_auth_uuid[free][1] = uuid_hi;
+    return 1;
+}
+
+/* Dispatch the save/restore control subops that query or toggle
+ * per-client / per-pool state.  buf is only consumed by
+ * TMEMC_SAVE_GET_POOL_UUID; arg1 only by TMEMC_SAVE_BEGIN. */
+static NOINLINE int tmemc_save_subop(int cli_id, uint32_t pool_id,
+                        uint32_t subop, tmem_cli_va_t buf, uint32_t arg1)
+{
+    client_t *client = tmh_client_from_cli_id(cli_id);
+    /* fix: bound-check pool_id before indexing client->pools[] */
+    pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
+                   ? NULL : client->pools[pool_id];
+    uint32_t p;
+    uint64_t *uuid;
+    pgp_t *pgp, *pgp2;
+
+    switch(subop)
+    {
+    case TMEMC_SAVE_BEGIN:
+        /* nothing to save for a domain that never used tmem */
+        if ( client == NULL )
+            return 0;
+        for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
+            if ( client->pools[p] != NULL )
+                break;
+        if ( p == MAX_POOLS_PER_DOMAIN )
+            return 0;
+        /* freeze for the duration of the save, remembering the prior
+         * state so TMEMC_SAVE_END can restore it; arg1 != 0 means this
+         * is a live migration rather than a plain save */
+        client->was_frozen = client->frozen;
+        client->frozen = 1;
+        if ( arg1 != 0 )
+            client->live_migrating = 1;
+        return 1;
+    case TMEMC_RESTORE_BEGIN:
+        ASSERT(client == NULL);
+        if ( (client = client_create(cli_id)) == NULL )
+            return -1;
+        return 1;
+    case TMEMC_SAVE_GET_VERSION:
+        return TMEM_SPEC_VERSION;
+    case TMEMC_SAVE_GET_MAXPOOLS:
+        return MAX_POOLS_PER_DOMAIN;
+    case TMEMC_SAVE_GET_CLIENT_WEIGHT:
+        if ( client == NULL )  /* fix: was dereferenced unchecked */
+            return -1;
+        return client->weight == -1 ? -2 : client->weight;
+    case TMEMC_SAVE_GET_CLIENT_CAP:
+        if ( client == NULL )  /* fix: was dereferenced unchecked */
+            return -1;
+        return client->cap == -1 ? -2 : client->cap;
+    case TMEMC_SAVE_GET_CLIENT_FLAGS:
+        if ( client == NULL )  /* fix: was dereferenced unchecked */
+            return -1;
+        return (client->compress ? TMEM_CLIENT_COMPRESS : 0 ) |
+               (client->was_frozen ? TMEM_CLIENT_FROZEN : 0 );
+    case TMEMC_SAVE_GET_POOL_FLAGS:
+        if ( pool == NULL )
+            return -1;
+        return (pool->persistent ? TMEM_POOL_PERSIST : 0) |
+               (pool->shared ? TMEM_POOL_SHARED : 0) |
+               (pool->pageshift << TMEM_POOL_PAGESIZE_SHIFT);
+    case TMEMC_SAVE_GET_POOL_NPAGES:
+        if ( pool == NULL )
+            return -1;
+        return _atomic_read(pool->pgp_count);
+    case TMEMC_SAVE_GET_POOL_UUID:
+        if ( pool == NULL )
+            return -1;
+        /* NOTE(review): writes through buf.p directly; assumes the
+         * privileged caller's buffer is host-mapped -- confirm whether
+         * a copy-to-guest accessor is required here */
+        uuid = (uint64_t *)buf.p;
+        *uuid++ = pool->uuid[0];
+        *uuid = pool->uuid[1];
+        return 0;
+    case TMEMC_SAVE_END:
+        if ( client == NULL )  /* fix: was dereferenced unchecked */
+            return -1;
+        client->live_migrating = 0;
+        /* drop bookkeeping for pages invalidated during migration */
+        if ( !list_empty(&client->persistent_invalidated_list) )
+            list_for_each_entry_safe(pgp,pgp2,
+              &client->persistent_invalidated_list, client_inv_pages)
+                pgp_free_from_inv_list(client,pgp);
+        client->frozen = client->was_frozen;
+        return 0;
+    }
+    return -1;
+}
+
+/* Copy the next persistent page of the pool (handle + data) into buf.
+ * pool->cur_pgp is the iteration cursor: the entry returned by the
+ * previous call, or NULL before the first call.
+ * Returns the do_tmem_get() result for the emitted page, -1 when the
+ * pool is missing/ephemeral or the list is exhausted, -ENOMEM when the
+ * buffer is too small. */
+static NOINLINE int tmemc_save_get_next_page(int cli_id, int pool_id,
+                        tmem_cli_va_t buf, uint32_t bufsize)
+{
+    client_t *client = tmh_client_from_cli_id(cli_id);
+    /* fix: bound-check pool_id (a signed int) before indexing pools[] */
+    pool_t *pool = (client == NULL || pool_id < 0 ||
+                    pool_id >= MAX_POOLS_PER_DOMAIN)
+                   ? NULL : client->pools[pool_id];
+    pgp_t *pgp;
+    int ret = 0;
+    struct tmem_handle *h;
+    unsigned int pagesize;
+
+    /* only persistent pools are saved page-by-page */
+    if ( pool == NULL || is_ephemeral(pool) )
+        return -1;
+    /* fix: pool->pageshift was read before the pool == NULL check */
+    pagesize = 1 << (pool->pageshift + 12);
+    if ( bufsize < pagesize + sizeof(struct tmem_handle) )
+        return -ENOMEM;
+
+    tmem_spin_lock(&pers_lists_spinlock);
+    if ( list_empty(&pool->persistent_page_list) )
+    {
+        ret = -1;
+        goto out;
+    }
+    if ( pool->cur_pgp == NULL )
+    {
+        /* first call: start with the head of the list */
+        pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
+                         pgp_t,pool_pers_pages);
+    } else if ( list_is_last(&pool->cur_pgp->pool_pers_pages,
+                             &pool->persistent_page_list) )
+    {
+        /* already processed the last one in the list */
+        ret = -1;
+        goto out;
+    } else {
+        /* fix: advance only when resuming -- the original advanced
+         * unconditionally and so never emitted the first page
+         * (cf. the cursor logic in tmemc_save_get_next_inv) */
+        pgp = list_entry((&pool->cur_pgp->pool_pers_pages)->next,
+                         pgp_t,pool_pers_pages);
+        pool->cur_pgp = pgp;
+    }
+    pgp = pool->cur_pgp;
+    /* emit the handle first, then the page data immediately after it */
+    h = (struct tmem_handle *)buf.p;
+    h->oid = pgp->obj->oid;
+    h->index = pgp->index;
+    buf.p = (void *)(h+1);
+    ret = do_tmem_get(pool, h->oid, h->index,0,0,0,pagesize,buf.p);
+
+out:
+    tmem_spin_unlock(&pers_lists_spinlock);
+    return ret;
+}
+
+/* Return, one per call, the handle of each persistent page invalidated
+ * while live migration was in progress, so the restore side can flush
+ * its stale copy.  client->cur_pgp is the iteration cursor (the entry
+ * returned by the previous call); it resets to NULL at end-of-list.
+ * Returns 1 and fills *buf with a tmem_handle when an entry is
+ * emitted; 0 when there is no client, the buffer is too small, or the
+ * list is empty/exhausted. */
+static NOINLINE int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_t buf,
+                        uint32_t bufsize)
+{
+    client_t *client = tmh_client_from_cli_id(cli_id);
+    pgp_t *pgp;
+    struct tmem_handle *h;
+    int ret = 0;
+
+    if ( client == NULL )
+        return 0;
+    if ( bufsize < sizeof(struct tmem_handle) )
+        return 0;
+    tmem_spin_lock(&pers_lists_spinlock);
+    if ( list_empty(&client->persistent_invalidated_list) )
+        goto out;
+    if ( client->cur_pgp == NULL )
+    {
+        /* first call: start at the head of the invalidated list */
+        pgp = list_entry((&client->persistent_invalidated_list)->next,
+                         pgp_t,client_inv_pages);
+        client->cur_pgp = pgp;
+    } else if ( list_is_last(&client->cur_pgp->client_inv_pages, 
+                             &client->persistent_invalidated_list) )
+    {
+        /* previous call emitted the last entry; reset the cursor */
+        client->cur_pgp = NULL;
+        ret = 0;
+        goto out;
+    } else {
+        /* resume: advance one entry past the cursor */
+        pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
+                         pgp_t,client_inv_pages);
+        client->cur_pgp = pgp;
+    }
+    /* on the invalidated list the pgp unions hold pool_id/inv_oid
+     * rather than obj/timestamp (see struct tmem_page_descriptor).
+     * NOTE(review): writes through buf.p directly; assumes the
+     * privileged caller's buffer is host-mapped -- confirm */
+    h = (struct tmem_handle *)buf.p;
+    h->pool_id = pgp->pool_id;
+    h->oid = pgp->inv_oid;
+    h->index = pgp->index;
+    ret = 1;
+out:
+    tmem_spin_unlock(&pers_lists_spinlock);
+    return ret;
+}
+
+/* Restore-side: put one saved page (data at buf.p, bufsize bytes) back
+ * into pool pool_id of client cli_id at handle (oid,index).
+ * Returns the do_tmem_put() result, or -1 if the pool is unknown. */
+static int tmemc_restore_put_page(int cli_id, int pool_id, uint64_t oid,
+                      uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
+{
+    client_t *client = tmh_client_from_cli_id(cli_id);
+    /* fix: bound-check pool_id (a signed int) before indexing pools[];
+     * also removed an unused, mis-indented "int ret = 0" local */
+    pool_t *pool = (client == NULL || pool_id < 0 ||
+                    pool_id >= MAX_POOLS_PER_DOMAIN)
+                   ? NULL : client->pools[pool_id];
+
+    if ( pool == NULL )
+        return -1;
+    return do_tmem_put(pool,oid,index,0,0,0,bufsize,buf.p);
+}
+
+/* Restore-side: flush the (stale) page at handle (oid,index) from pool
+ * pool_id of client cli_id; driven by the invalidation records emitted
+ * by tmemc_save_get_next_inv on the save side.
+ * Returns the do_tmem_flush_page() result, or -1 for an unknown pool. */
+static int tmemc_restore_flush_page(int cli_id, int pool_id, uint64_t oid,
+                        uint32_t index)
+{
+    client_t *client = tmh_client_from_cli_id(cli_id);
+    /* fix: bound-check pool_id (a signed int) before indexing pools[],
+     * matching tmemc_restore_put_page */
+    pool_t *pool = (client == NULL || pool_id < 0 ||
+                    pool_id >= MAX_POOLS_PER_DOMAIN)
+                   ? NULL : client->pools[pool_id];
+
+    if ( pool == NULL )
+        return -1;
+    return do_tmem_flush_page(pool, oid, index);
+}
+
+static NOINLINE int do_tmem_control(struct tmem_op *op)
 {
     int ret;
-    cli_id_t cli_id = (cli_id_t)cli_id32;
+    uint32_t pool_id = op->pool_id;
+    uint32_t subop = op->u.ctrl.subop;
 
     if (!tmh_current_is_privileged())
     {
@@ -1781,18 +2137,50 @@ static int do_tmem_control(uint32_t subo
     case TMEMC_THAW:
     case TMEMC_FREEZE:
     case TMEMC_DESTROY:
-        ret = tmemc_freeze_pools(cli_id,subop);
+        ret = tmemc_freeze_pools(op->u.ctrl.cli_id,subop);
         break;
     case TMEMC_FLUSH:
-        ret = tmemc_flush_mem(cli_id,arg1);
+        ret = tmemc_flush_mem(op->u.ctrl.cli_id,op->u.ctrl.arg1);
         break;
     case TMEMC_LIST:
-        ret = tmemc_list(cli_id,buf,arg1,arg2);
+        ret = tmemc_list(op->u.ctrl.cli_id,op->u.ctrl.buf,
+                         op->u.ctrl.arg1,op->u.ctrl.arg2);
         break;
     case TMEMC_SET_WEIGHT:
     case TMEMC_SET_CAP:
     case TMEMC_SET_COMPRESS:
-        ret = tmemc_set_var(cli_id,subop,arg1);
+        ret = tmemc_set_var(op->u.ctrl.cli_id,subop,op->u.ctrl.arg1);
+        break;
+    case TMEMC_SAVE_BEGIN:
+    case TMEMC_RESTORE_BEGIN:
+    case TMEMC_SAVE_GET_VERSION:
+    case TMEMC_SAVE_GET_MAXPOOLS:
+    case TMEMC_SAVE_GET_CLIENT_WEIGHT:
+    case TMEMC_SAVE_GET_CLIENT_CAP:
+    case TMEMC_SAVE_GET_CLIENT_FLAGS:
+    case TMEMC_SAVE_GET_POOL_FLAGS:
+    case TMEMC_SAVE_GET_POOL_NPAGES:
+    case TMEMC_SAVE_GET_POOL_UUID:
+    case TMEMC_SAVE_END:
+        ret = tmemc_save_subop(op->u.ctrl.cli_id,pool_id,subop,
+                        op->u.ctrl.buf,op->u.ctrl.arg1);
+        break;
+    case TMEMC_SAVE_GET_NEXT_PAGE:
+        ret = tmemc_save_get_next_page(op->u.ctrl.cli_id, pool_id,
+                                       op->u.ctrl.buf, op->u.ctrl.arg1);
+        break;
+    case TMEMC_SAVE_GET_NEXT_INV:
+        ret = tmemc_save_get_next_inv(op->u.ctrl.cli_id, op->u.ctrl.buf,
+                                      op->u.ctrl.arg1);
+        break;
+    case TMEMC_RESTORE_PUT_PAGE:
+        ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
+                                     op->u.ctrl.arg3, op->u.ctrl.arg2,
+                                     op->u.ctrl.buf, op->u.ctrl.arg1);
+        break;
+    case TMEMC_RESTORE_FLUSH_PAGE:
+        ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
+                                       op->u.ctrl.arg3, op->u.ctrl.arg2);
         break;
     default:
         ret = -1;
@@ -1850,8 +2238,19 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
     {
         tmem_write_lock(&tmem_rwlock);
         tmem_write_lock_set = 1;
-        rc = do_tmem_control(op.u.ctrl.subop, op.u.ctrl.cli_id,
-                             op.u.ctrl.arg1, op.u.ctrl.arg2, op.u.ctrl.buf);
+        rc = do_tmem_control(&op);
+        goto out;
+    } else if ( op.cmd == TMEM_AUTH ) {
+        tmem_write_lock(&tmem_rwlock);
+        tmem_write_lock_set = 1;
+        rc = tmemc_shared_pool_auth(op.u.new.arg1,op.u.new.uuid[0],
+                         op.u.new.uuid[1],op.u.new.flags);
+        goto out;
+    } else if ( op.cmd == TMEM_RESTORE_NEW ) {
+        tmem_write_lock(&tmem_rwlock);
+        tmem_write_lock_set = 1;
+        rc = do_tmem_new_pool(op.u.new.arg1, op.pool_id, op.u.new.flags,
+                         op.u.new.uuid[0], op.u.new.uuid[1]);
         goto out;
     }
 
@@ -1860,7 +2259,7 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
     {
         tmem_write_lock(&tmem_rwlock);
         tmem_write_lock_set = 1;
-        if ( (client = client_create()) == NULL )
+        if ( (client = client_create(tmh_get_cli_id_from_current())) == NULL )
         {
             printk("tmem: can't create tmem structure for %s\n",client_str);
             rc = -ENOMEM;
@@ -1896,22 +2295,22 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
     switch ( op.cmd )
     {
     case TMEM_NEW_POOL:
-        rc = do_tmem_new_pool(op.u.new.flags,
+        rc = do_tmem_new_pool(CLI_ID_NULL, 0, op.u.new.flags,
                               op.u.new.uuid[0], op.u.new.uuid[1]);
         break;
     case TMEM_NEW_PAGE:
-        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
-                         0, 0, 0);
+        rc = do_tmem_put(pool, op.u.gen.object,
+                         op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
         break;
     case TMEM_PUT_PAGE:
-        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
-                         0, 0, PAGE_SIZE);
+        rc = do_tmem_put(pool, op.u.gen.object,
+                    op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
         if (rc == 1) succ_put = 1;
         else non_succ_put = 1;
         break;
     case TMEM_GET_PAGE:
         rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
-                         0, 0, PAGE_SIZE);
+                         0, 0, PAGE_SIZE, 0);
         if (rc == 1) succ_get = 1;
         else non_succ_get = 1;
         break;
@@ -1930,12 +2329,13 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
     case TMEM_READ:
         rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
                          op.u.gen.tmem_offset, op.u.gen.pfn_offset,
-                         op.u.gen.len);
+                         op.u.gen.len,0);
         break;
     case TMEM_WRITE:
-        rc = do_tmem_put(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
+        rc = do_tmem_put(pool, op.u.gen.object,
+                         op.u.gen.index, op.u.gen.cmfn,
                          op.u.gen.tmem_offset, op.u.gen.pfn_offset,
-                         op.u.gen.len);
+                         op.u.gen.len, NULL);
         break;
     case TMEM_XCHG:
         /* need to hold global lock to ensure xchg is atomic */
diff -r 5333e6497af6 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c     Mon Jul 20 15:45:50 2009 +0100
+++ b/xen/common/tmem_xen.c     Wed Aug 05 11:17:18 2009 -0600
@@ -19,6 +19,9 @@ boolean_param("tmem", opt_tmem);
 
 EXPORT int opt_tmem_compress = 0;
 boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_shared_auth = 0;
+boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
 
 EXPORT int opt_tmem_lock = 0;
 integer_param("tmem_lock", opt_tmem_lock);
@@ -98,14 +101,14 @@ static inline void *cli_mfn_to_va(tmem_c
 
 EXPORT int tmh_copy_from_client(pfp_t *pfp,
     tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-    uint32_t pfn_offset, uint32_t len)
+    uint32_t pfn_offset, uint32_t len, void *cli_va)
 {
     unsigned long tmem_mfn;
-    void *tmem_va, *cli_va = NULL;
+    void *tmem_va;
 
     ASSERT(pfp != NULL);
     if ( tmem_offset || pfn_offset || len )
-        if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+    if ( (cli_va == NULL) && ((cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL) )
             return -EFAULT;
     tmem_mfn = page_to_mfn(pfp);
     tmem_va = map_domain_page(tmem_mfn);
@@ -123,14 +126,13 @@ EXPORT int tmh_copy_from_client(pfp_t *p
 }
 
 EXPORT int tmh_compress_from_client(tmem_cli_mfn_t cmfn,
-    void **out_va, size_t *out_len)
+    void **out_va, size_t *out_len, void *cli_va)
 {
-    void *cli_va;
     int ret = 0;
     unsigned char *dmem = this_cpu(dstmem);
     unsigned char *wmem = this_cpu(workmem);
 
-    if ( (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
+    if ( (cli_va == NULL) && (cli_va = cli_mfn_to_va(cmfn,NULL)) == NULL)
         return -EFAULT;
     if ( dmem == NULL || wmem == NULL )
         return 0;  /* no buffer, so can't compress */
@@ -143,13 +145,16 @@ EXPORT int tmh_compress_from_client(tmem
 }
 
 EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
-    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len)
+    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cli_va)
 {
-    unsigned long tmem_mfn, cli_mfn;
-    void *tmem_va, *cli_va;
+    unsigned long tmem_mfn, cli_mfn = 0;
+    int mark_dirty = 1;
+    void *tmem_va;
 
     ASSERT(pfp != NULL);
-    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+    if ( cli_va != NULL )
+        mark_dirty = 0;
+    else if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
         return -EFAULT;
     tmem_mfn = page_to_mfn(pfp);
     tmem_va = map_domain_page(tmem_mfn);
@@ -158,26 +163,35 @@ EXPORT int tmh_copy_to_client(tmem_cli_m
     else if ( (tmem_offset+len <= PAGE_SIZE) && (pfn_offset+len <= PAGE_SIZE) )
         memcpy((char *)cli_va+pfn_offset,(char *)tmem_va+tmem_offset,len);
     unmap_domain_page(tmem_va);
-    unmap_domain_page(cli_va);
-    paging_mark_dirty(current->domain,cli_mfn);
+    if ( mark_dirty )
+    {
+        unmap_domain_page(cli_va);
+        paging_mark_dirty(current->domain,cli_mfn);
+    }
     mb();
     return 1;
 }
 
-EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, size_t size)
+EXPORT int tmh_decompress_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
+                                    size_t size, void *cli_va)
 {
-    unsigned long cli_mfn;
-    void *cli_va;
+    unsigned long cli_mfn = 0;
+    int mark_dirty = 1;
     size_t out_len = PAGE_SIZE;
     int ret;
 
-    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+    if ( cli_va != NULL )
+        mark_dirty = 0;
+    else if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
         return -EFAULT;
     ret = lzo1x_decompress_safe(tmem_va, size, cli_va, &out_len);
     ASSERT(ret == LZO_E_OK);
     ASSERT(out_len == PAGE_SIZE);
-    unmap_domain_page(cli_va);
-    paging_mark_dirty(current->domain,cli_mfn);
+    if ( mark_dirty )
+    {
+        unmap_domain_page(cli_va);
+        paging_mark_dirty(current->domain,cli_mfn);
+    }
     mb();
     return 1;
 }
diff -r 5333e6497af6 xen/include/public/tmem.h
--- a/xen/include/public/tmem.h Mon Jul 20 15:45:50 2009 +0100
+++ b/xen/include/public/tmem.h Wed Aug 05 11:17:18 2009 -0600
@@ -42,15 +42,36 @@
 #define TMEM_WRITE                 9
 #define TMEM_XCHG                 10
 
+/* Privileged commands to HYPERVISOR_tmem_op() */
+#define TMEM_AUTH                 101 
+#define TMEM_RESTORE_NEW          102
+
 /* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
-#define TMEMC_THAW                 0
-#define TMEMC_FREEZE               1
-#define TMEMC_FLUSH                2
-#define TMEMC_DESTROY              3
-#define TMEMC_LIST                 4
-#define TMEMC_SET_WEIGHT           5
-#define TMEMC_SET_CAP              6
-#define TMEMC_SET_COMPRESS         7
+#define TMEMC_THAW                   0
+#define TMEMC_FREEZE                 1
+#define TMEMC_FLUSH                  2
+#define TMEMC_DESTROY                3
+#define TMEMC_LIST                   4
+#define TMEMC_SET_WEIGHT             5
+#define TMEMC_SET_CAP                6
+#define TMEMC_SET_COMPRESS           7
+#define TMEMC_SHARED_POOL_AUTH       8
+#define TMEMC_SHARED_POOL_DEAUTH     9
+#define TMEMC_SAVE_BEGIN             10
+#define TMEMC_SAVE_GET_VERSION       11
+#define TMEMC_SAVE_GET_MAXPOOLS      12
+#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13
+#define TMEMC_SAVE_GET_CLIENT_CAP    14
+#define TMEMC_SAVE_GET_CLIENT_FLAGS  15
+#define TMEMC_SAVE_GET_POOL_FLAGS    16
+#define TMEMC_SAVE_GET_POOL_NPAGES   17
+#define TMEMC_SAVE_GET_POOL_UUID     18
+#define TMEMC_SAVE_GET_NEXT_PAGE     19
+#define TMEMC_SAVE_GET_NEXT_INV      20
+#define TMEMC_SAVE_END               21
+#define TMEMC_RESTORE_BEGIN          30
+#define TMEMC_RESTORE_PUT_PAGE       32
+#define TMEMC_RESTORE_FLUSH_PAGE     33
 
 /* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
 #define TMEM_POOL_PERSIST          1
@@ -60,6 +81,10 @@
 #define TMEM_POOL_VERSION_SHIFT   24
 #define TMEM_POOL_VERSION_MASK  0xff
 
+/* Bits for client flags (save/restore) */
+#define TMEM_CLIENT_COMPRESS       1
+#define TMEM_CLIENT_FROZEN         2
+
 /* Special errno values */
 #define EFROZEN                 1000
 #define EEMPTY                  1001
@@ -70,31 +95,40 @@ typedef XEN_GUEST_HANDLE(char) tmem_cli_
 typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
 struct tmem_op {
     uint32_t cmd;
-    int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */
+    int32_t pool_id;
     union {
-        struct {  /* for cmd == TMEM_NEW_POOL */
+        struct {
             uint64_t uuid[2];
             uint32_t flags;
-        } new;
-        struct {  /* for cmd == TMEM_CONTROL */
+            uint32_t arg1;
+        } new; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */
+        struct { 
             uint32_t subop;
             uint32_t cli_id;
             uint32_t arg1;
             uint32_t arg2;
+            uint64_t arg3;
             tmem_cli_va_t buf;
-        } ctrl;
+        } ctrl; /* for cmd == TMEM_CONTROL */
         struct {
+            
             uint64_t object;
             uint32_t index;
             uint32_t tmem_offset;
             uint32_t pfn_offset;
             uint32_t len;
             tmem_cli_mfn_t cmfn; /* client machine page frame */
-        } gen;
+        } gen; /* for all other cmd ("generic") */
     } u;
 };
 typedef struct tmem_op tmem_op_t;
 DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
+
+struct tmem_handle {
+    uint32_t pool_id;
+    uint32_t index;
+    uint64_t oid;
+};
 
 #endif
 
diff -r 5333e6497af6 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h        Mon Jul 20 15:45:50 2009 +0100
+++ b/xen/include/xen/tmem_xen.h        Wed Aug 05 11:17:18 2009 -0600
@@ -53,6 +53,12 @@ static inline int tmh_compression_enable
 static inline int tmh_compression_enabled(void)
 {
     return opt_tmem_compress;
+}
+
+extern int opt_tmem_shared_auth;
+static inline int tmh_shared_auth(void)
+{
+    return opt_tmem_shared_auth;
 }
 
 extern int opt_tmem;
@@ -271,9 +277,10 @@ static inline tmh_cli_ptr_t *tmh_get_cli
     return current->domain;
 }
 
-static inline void tmh_set_current_client(struct client *client)
+static inline void tmh_set_client_from_id(struct client *client,cli_id_t cli_id)
 {
-    current->domain->tmem = client;
+    struct domain *d = get_domain_by_id(cli_id);
+    d->tmem = client;
 }
 
 static inline bool_t tmh_current_is_privileged(void)
@@ -301,9 +308,11 @@ static inline int tmh_get_tmemop_from_cl
             return rc;
         switch ( cop.cmd )
         {
-        case TMEM_NEW_POOL: u = XLAT_tmem_op_u_new;  break;
-        case TMEM_CONTROL:  u = XLAT_tmem_op_u_ctrl; break;
-        default:            u = XLAT_tmem_op_u_gen;  break;
+        case TMEM_NEW_POOL:   u = XLAT_tmem_op_u_new;   break;
+        case TMEM_CONTROL:    u = XLAT_tmem_op_u_ctrl;  break;
+        case TMEM_AUTH:       u = XLAT_tmem_op_u_new;   break;
+        case TMEM_RESTORE_NEW:u = XLAT_tmem_op_u_new;   break;
+        default:              u = XLAT_tmem_op_u_gen ;  break;
         }
 #define XLAT_tmem_op_HNDL_u_ctrl_buf(_d_, _s_) \
         guest_from_compat_handle((_d_)->u.ctrl.buf, (_s_)->u.ctrl.buf)
@@ -326,16 +335,16 @@ static inline void tmh_copy_to_client_bu
 #define tmh_cli_id_str "domid"
 #define tmh_client_str "domain"
 
-extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t);
+extern int tmh_decompress_to_client(tmem_cli_mfn_t,void*,size_t,void*);
 
-extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *);
+extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*);
 
 extern int tmh_copy_from_client(pfp_t *pfp,
     tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-    uint32_t pfn_offset, uint32_t len);
+    uint32_t pfn_offset, uint32_t len, void *cva);
 
 extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
-    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len);
+    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva);
 
 
 #define TMEM_PERF

Attachment: tmem-saverestore-090805.patch
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.