[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] [xen-unstable] tmem: add page deduplication with optional compression or trailing-zero-elimination



Add "page deduplication" capability (with optional compression
and trailing-zero elimination) to Xen's tmem.

(Transparent to tmem-enabled guests.)  Ephemeral pages
that have the exact same content are "combined" so that only
one page frame is needed.  Since ephemeral pages are essentially
read-only, no C-O-W (and thus no equivalent of swapping) is
necessary.  Deduplication can be combined with compression
or "trailing zero elimination" for even more space savings.

No non-tmem code is affected.

Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>

Points of interest:
- Modifications to LRU eviction algorithm to accommodate
  dedup'ed pages
- New data structures to allow lookup of matching pages
  and track references. (Algorithm used is similar to
  that used by KSM in KVM/Linux: No hashing required.)
- Lock (and rbtree) chosen by first byte of data to
  allow reasonably high concurrency without greatly
  complicating lock management.
- Statistics added so "dedup ratio" can be monitored.
- Dedup is enabled/disabled by a Xen command line option
  (tmem_dedup) and must be combined with the tmem Xen option.

Trailing zero elimination ("tze") saves significant tmem
RAM when many data files are smaller than 1-3 pages and the
remaining (unused) space at the end of a page is zero-filled
on disk or in memory, as may be the case, for example, when
VMs are serving a large number of (deduplicable) web pages.

Compression already was a tmem option; v2 of this patch
combines compression with deduplication to further
improve tmem RAM utilization.

Either option (tmem_tze or tmem_compress) can be enabled
at xen boot time in addition to deduplication (tmem_dedup)
but compression overrides/disables tze.  Both have a
significant CPU cost so are useful primarily when
memory is more constrained than CPU cycles, for example
on a many-core machine with many low CPU-utilization
RAM-needy VMs.

tools/misc/xen-tmem-list-parse.c |   30 ++
 xen/common/tmem.c                |  474 ++++++++++++++++++++++++++++++++-------
 xen/common/tmem_xen.c            |   33 ++
 xen/include/xen/tmem_xen.h       |  122 +++++++++-
 4 files changed, 576 insertions(+), 83 deletions(-)

diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c
--- a/tools/misc/xen-tmem-list-parse.c  Wed Mar 03 17:41:58 2010 +0000
+++ b/tools/misc/xen-tmem-list-parse.c  Mon Apr 05 16:09:58 2010 -0600
@@ -110,13 +110,39 @@ void parse_global(char *s)
     unsigned long long rtree_node_max = parse(s,"Nm");
     unsigned long long pgp_count = parse(s,"Pc");
     unsigned long long pgp_max = parse(s,"Pm");
+    unsigned long long page_count = parse(s,"Fc");
+    unsigned long long max_page_count = parse(s,"Fm");
+    unsigned long long pcd_count = parse(s,"Sc");
+    unsigned long long max_pcd_count = parse(s,"Sm");
+    unsigned long long pcd_tot_tze_size = parse(s,"Zt");
+    unsigned long long pcd_tot_csize = parse(s,"Gz");
+    unsigned long long deduped_puts = parse(s,"Gd");
+    unsigned long long tot_good_eph_puts = parse(s,"Ep");
 
     printf("total tmem ops=%llu (errors=%llu) -- tmem pages avail=%llu\n",
            total_ops, errored_ops, avail_pages);
     printf("datastructs: objs=%llu (max=%llu) pgps=%llu (max=%llu) "
-           "nodes=%llu (max=%llu)\n",
+           "nodes=%llu (max=%llu) pages=%llu (max=%llu) ",
            obj_count, obj_max, pgp_count, pgp_max,
-           rtree_node_count, rtree_node_max);
+           rtree_node_count, rtree_node_max,
+           page_count,max_page_count);
+    if (max_pcd_count != 0 && global_eph_count != 0 && tot_good_eph_puts != 0) 
{
+           printf("pcds=%llu (max=%llu) ",
+               pcd_count,max_pcd_count);
+           printf("deduped: avg=%4.2f%% (curr=%4.2f%%) ",
+                   ((deduped_puts*1.0)/tot_good_eph_puts)*100,
+                   (1.0-(pcd_count*1.0)/global_eph_count)*100);
+    }
+    if (pcd_count != 0)
+    {
+           if (pcd_tot_tze_size && (pcd_tot_tze_size < pcd_count*PAGE_SIZE))
+               printf("tze savings=%4.2f%% ",
+                   (1.0-(pcd_tot_tze_size*1.0)/(pcd_count*PAGE_SIZE))*100);
+           if (pcd_tot_csize && (pcd_tot_csize < pcd_count*PAGE_SIZE))
+               printf("compression savings=%4.2f%% ",
+                   (1.0-(pcd_tot_csize*1.0)/(pcd_count*PAGE_SIZE))*100);
+    }
+    printf("\n");
     printf("misc: failed_copies=%llu alloc_failed=%llu alloc_page_failed=%llu "
            "low_mem=%llu evicted=%llu/%llu relinq=%llu/%llu, "
            "max_evicts_per_relinq=%llu, flush_pools=%llu, "
diff -r b8d2a4134a68 xen/common/tmem.c
--- a/xen/common/tmem.c Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem.c Mon Apr 05 16:09:58 2010 -0600
@@ -6,11 +6,10 @@
  * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
  */
 
-/* TODO list: 090129
-   - improve on reclamation policy
+/* TODO list: 090129 (updated 100318)
+   - any better reclamation policy?
    - use different tlsf pools for each client (maybe each pool)
-   - implement page accounting and minimal QoS limits
-   - test shared access more completely (need pv cluster fs)
+   - test shared access more completely (ocfs2)
    - add feedback-driven compression (not for persistent pools though!)
    - add data-structure total bytes overhead stats
  */
@@ -77,12 +76,17 @@ static unsigned long relinq_pgs = 0, rel
 static unsigned long relinq_pgs = 0, relinq_attempts = 0;
 static unsigned long max_evicts_per_relinq = 0;
 static unsigned long low_on_memory = 0;
+static unsigned long deduped_puts = 0;
+static unsigned long tot_good_eph_puts = 0;
 static int global_obj_count_max = 0;
 static int global_pgp_count_max = 0;
+static int global_pcd_count_max = 0;
 static int global_page_count_max = 0;
 static int global_rtree_node_count_max = 0;
 static long global_eph_count_max = 0;
 static unsigned long failed_copies;
+static unsigned long pcd_tot_tze_size = 0;
+static unsigned long pcd_tot_csize = 0;
 
 DECL_CYC_COUNTER(succ_get);
 DECL_CYC_COUNTER(succ_put);
@@ -108,6 +112,7 @@ DECL_CYC_COUNTER(decompress);
 
 struct tm_pool;
 struct tmem_page_descriptor;
+struct tmem_page_content_descriptor;
 struct client {
     struct list_head client_list;
     struct tm_pool *pools[MAX_POOLS_PER_DOMAIN];
@@ -219,12 +224,17 @@ struct tmem_page_descriptor {
         obj_t *obj;
         uint64_t inv_oid;  /* used for invalid list only */
     };
+    pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
+                    else compressed data (cdata) */
     uint32_t index;
-    size_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
-                    else compressed data (cdata) */
+    /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings */
+    uint16_t firstbyte; /* NON_SHAREABLE->pfp  otherwise->pcd */
+    bool_t eviction_attempted;  /* CHANGE TO lifetimes? (settable) */
+    struct list_head pcd_siblings;
     union {
         pfp_t *pfp;  /* page frame pointer */
         char *cdata; /* compressed data */
+        struct tmem_page_content_descriptor *pcd; /* page dedup */
     };
     union {
         uint64_t timestamp;
@@ -233,6 +243,25 @@ struct tmem_page_descriptor {
     DECL_SENTINEL
 };
 typedef struct tmem_page_descriptor pgp_t;
+
+#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
+
+struct tmem_page_content_descriptor {
+    union { /* which member is valid follows from size, see below */
+        pfp_t *pfp;  /* page frame pointer */
+        char *cdata; /* if compression_enabled */
+        char *tze; /* if !compression_enabled, trailing zeroes eliminated */
+    };
+    struct list_head pgp_list; /* pgps sharing this content (pcd_siblings) */
+    struct rb_node pcd_rb_tree_node; /* node in pcd_tree_roots[firstbyte] */
+    uint32_t pgp_ref_count; /* pgps pointing here; pcd is freed at zero */
+    pagesize_t size; /* if compression_enabled -> 0<size<PAGE_SIZE (*cdata)
+                     * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
+                     * else PAGE_SIZE -> *pfp */
+};
+typedef struct tmem_page_content_descriptor pcd_t;
+struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page */
+rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */
 
 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral pools 
*/
 
@@ -267,6 +296,7 @@ static long global_eph_count = 0; /* ato
 static long global_eph_count = 0; /* atomicity depends on eph_lists_spinlock */
 static atomic_t global_obj_count = ATOMIC_INIT(0);
 static atomic_t global_pgp_count = ATOMIC_INIT(0);
+static atomic_t global_pcd_count = ATOMIC_INIT(0);
 static atomic_t global_page_count = ATOMIC_INIT(0);
 static atomic_t global_rtree_node_count = ATOMIC_INIT(0);
 
@@ -336,6 +366,229 @@ static NOINLINE void tmem_page_free(pool
     atomic_dec_and_assert(global_page_count);
 }
 
+/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/
+
+#define NOT_SHAREABLE ((uint16_t)-1UL)
+/* copy the data behind pgp's pcd to client page cmfn (may return -EFAULT) */
+static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp)
+{
+    uint8_t firstbyte = pgp->firstbyte;
+    pcd_t *pcd;
+    int ret;
+
+    ASSERT(tmh_dedup_enabled());
+    tmem_read_lock(&pcd_tree_rwlocks[firstbyte]); /* needed to use pgp->pcd */
+    pcd = pgp->pcd;
+    if ( pgp->size < PAGE_SIZE && pgp->size != 0 &&
+         pcd->size < PAGE_SIZE && pcd->size != 0 ) /* compressed data */
+        ret = tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL);
+    else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE ) /* zero-trimmed */
+        ret = tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size);
+    else /* full physical page */
+        ret = tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL);
+    tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]);
+    return ret;
+}
+
+/* unlink pgp from its pcd; the pcd and its data are freed with the last pgp */
+/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done */
+static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t have_pcd_rwlock)
+{
+    pcd_t *pcd = pgp->pcd; /* NOTE(review): read before the rwlock is taken */
+    pfp_t *pfp = pgp->pcd->pfp;
+    uint16_t firstbyte = pgp->firstbyte;
+    char *pcd_tze = pgp->pcd->tze;
+    pagesize_t pcd_size = pcd->size;
+    pagesize_t pgp_size = pgp->size; /* saved: pgp->size is reset below */
+    char *pcd_cdata = pgp->pcd->cdata;
+    pagesize_t pcd_csize = pgp->pcd->size; /* same value as pcd_size */
+
+    ASSERT(tmh_dedup_enabled());
+    ASSERT(firstbyte != NOT_SHAREABLE);
+    ASSERT(firstbyte < 256);
+    
+    if ( have_pcd_rwlock )
+        ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]);
+    else
+        tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+    list_del_init(&pgp->pcd_siblings);
+    pgp->pcd = NULL;
+    pgp->firstbyte = NOT_SHAREABLE;
+    pgp->size = -1; /* -1 == data invalid */
+    if ( --pcd->pgp_ref_count ) /* other pgps still share this pcd */
+    {
+        tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+        return;
+    }
+
+    /* no more references to this pcd, recycle it and the physical page */
+    ASSERT(list_empty(&pcd->pgp_list));
+    pcd->pfp = NULL;
+    /* remove pcd from rbtree */
+    rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]);
+    /* reinit the struct for safety for now */
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);
+    /* now free up the pcd memory; its data is freed via the saved locals */
+    tmem_free(pcd,sizeof(pcd_t),NULL);
+    atomic_dec_and_assert(global_pcd_count);
+    if ( pgp_size != 0 && pcd_size < PAGE_SIZE )
+    {
+        /* compressed data */
+        tmem_free(pcd_cdata,pcd_csize,pool);
+        pcd_tot_csize -= pcd_csize;
+    }
+    else if ( pcd_size != PAGE_SIZE )
+    {
+        /* trailing zero data */
+        pcd_tot_tze_size -= pcd_size;
+        if ( pcd_size ) /* size 0 has no tze buffer to free */
+            tmem_free(pcd_tze,pcd_size,pool);
+    } else {
+        /* real physical page */
+        if ( tmh_tze_enabled() )
+            pcd_tot_tze_size -= PAGE_SIZE;
+        if ( tmh_compression_enabled() )
+            pcd_tot_csize -= PAGE_SIZE;
+        tmem_page_free(pool,pfp);
+    }
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+}
+/* find or create the pcd matching pgp's data; returns 0 or -ENOMEM */
+static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csize)
+{
+    struct rb_node **new, *parent = NULL;
+    struct rb_root *root;
+    pcd_t *pcd;
+    int cmp;
+    pagesize_t pfp_size = 0;
+    uint8_t firstbyte = (cdata == NULL) ? tmh_get_first_byte(pgp->pfp) : *cdata;
+    int ret = 0;
+
+    if ( !tmh_dedup_enabled() )
+        return 0;
+    ASSERT(pgp->obj != NULL);
+    ASSERT(pgp->obj->pool != NULL);
+    ASSERT(!pgp->obj->pool->persistent);
+    if ( cdata == NULL )
+    {
+        ASSERT(pgp->pfp != NULL);
+        pfp_size = PAGE_SIZE;
+        if ( tmh_tze_enabled() )
+        {
+            pfp_size = tmh_tze_pfp_scan(pgp->pfp);
+            if ( pfp_size > PCD_TZE_MAX_SIZE ) /* under 1/64 saved: keep whole */
+                pfp_size = PAGE_SIZE;
+        }
+        ASSERT(pfp_size <= PAGE_SIZE);
+        ASSERT(!(pfp_size & (sizeof(uint64_t)-1)));
+    }
+    tmem_write_lock(&pcd_tree_rwlocks[firstbyte]);
+
+    /* look for page match */
+    root = &pcd_tree_roots[firstbyte];
+    new = &(root->rb_node);
+    while ( *new )
+    {
+        pcd = container_of(*new, pcd_t, pcd_rb_tree_node);
+        parent = *new;
+        /* compare new entry and rbtree entry, set cmp accordingly */
+        if ( cdata != NULL )
+        {
+            if ( pcd->size < PAGE_SIZE )
+                /* both new entry and rbtree entry are compressed */
+                cmp = tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size);
+            else
+                /* new entry is compressed, rbtree entry is not */
+                cmp = -1;
+        } else if ( pcd->size < PAGE_SIZE )
+            /* rbtree entry is compressed, new entry is not */
+            cmp = 1;
+        else if ( tmh_tze_enabled() ) {
+            if ( pcd->size < PAGE_SIZE )
+                /* both are tze -- NOTE(review): looks unreachable (see above) */
+                cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->size);
+            else
+                /* new entry is trailing zero, rbtree entry is not */
+                cmp = tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SIZE);
+        } else  {
+            /* both new entry and rbtree entry are full physical pages */
+            ASSERT(pgp->pfp != NULL);
+            ASSERT(pcd->pfp != NULL);
+            cmp = tmh_page_cmp(pgp->pfp,pcd->pfp);
+        }
+
+        /* walk tree or match depending on cmp */
+        if ( cmp < 0 )
+            new = &((*new)->rb_left);
+        else if ( cmp > 0 )
+            new = &((*new)->rb_right);
+        else
+        {
+            /* match! if not compressed, free the no-longer-needed page */
+            /* but if compressed, data is assumed static so don't free! */
+            if ( cdata == NULL )
+                tmem_page_free(pgp->obj->pool,pgp->pfp);
+            deduped_puts++;
+            goto match;
+        }
+    }
+
+    /* exited while loop with no match, so alloc a pcd and put it in the tree */
+    if ( (pcd = tmem_malloc(pcd_t, NULL)) == NULL )
+    {
+        ret = -ENOMEM;
+        goto unlock;
+    } else if ( cdata != NULL ) {
+        if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->obj->pool)) == NULL )
+        {
+            tmem_free(pcd,sizeof(pcd_t),NULL);
+            ret = -ENOMEM;
+            goto unlock;
+        }
+    }
+    atomic_inc_and_max(global_pcd_count);
+    RB_CLEAR_NODE(&pcd->pcd_rb_tree_node);  /* is this necessary */
+    INIT_LIST_HEAD(&pcd->pgp_list);  /* is this necessary */
+    pcd->pgp_ref_count = 0;
+    if ( cdata != NULL )
+    {
+        memcpy(pcd->cdata,cdata,csize);
+        pcd->size = csize;
+        pcd_tot_csize += csize;
+    } else if ( pfp_size == 0 ) {
+        ASSERT(tmh_tze_enabled());
+        pcd->size = 0;
+        pcd->tze = NULL;
+        tmem_page_free(pgp->obj->pool,pgp->pfp); /* all-zero page: don't leak */
+    } else if ( pfp_size < PAGE_SIZE &&
+         ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->obj->pool)) != NULL) ) {
+        tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
+        pcd->size = pfp_size;
+        pcd_tot_tze_size += pfp_size;
+        tmem_page_free(pgp->obj->pool,pgp->pfp);
+    } else {
+        pcd->pfp = pgp->pfp; /* the pcd takes over the page frame */
+        pcd->size = PAGE_SIZE;
+        if ( tmh_tze_enabled() )
+            pcd_tot_tze_size += PAGE_SIZE;
+        if ( tmh_compression_enabled() )
+            pcd_tot_csize += PAGE_SIZE;
+    }
+    rb_link_node(&pcd->pcd_rb_tree_node, parent, new);
+    rb_insert_color(&pcd->pcd_rb_tree_node, root);
+
+match:
+    pcd->pgp_ref_count++;
+    list_add(&pgp->pcd_siblings,&pcd->pgp_list);
+    pgp->firstbyte = firstbyte;
+    pgp->eviction_attempted = 0;
+    pgp->pcd = pcd; /* overwrites pgp->pfp/cdata (same union) */
+
+unlock:
+    tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+    return ret;
+}
+
 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
 
 /* allocate a pgp_t and associate it with an object */
@@ -353,6 +606,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
     INIT_LIST_HEAD(&pgp->global_eph_pages);
     INIT_LIST_HEAD(&pgp->client_eph_pages);
     pgp->pfp = NULL;
+    if ( tmh_dedup_enabled() )
+    {
+        pgp->firstbyte = NOT_SHAREABLE;
+        pgp->eviction_attempted = 0;
+        INIT_LIST_HEAD(&pgp->pcd_siblings);
+    }
     pgp->size = -1;
     pgp->index = -1;
     pgp->timestamp = get_cycles();
@@ -374,18 +633,20 @@ static pgp_t *pgp_lookup_in_obj(obj_t *o
 
 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool)
 {
+    pagesize_t pgp_size = pgp->size;
+
     if ( pgp->pfp == NULL )
         return;
-    if ( !pgp->size )
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+        pcd_disassociate(pgp,pool,0); /* pgp->size lost */
+    else if ( pgp_size )
+        tmem_free(pgp->cdata,pgp_size,pool);
+    else
         tmem_page_free(pgp->obj->pool,pgp->pfp);
-    else
+    if ( pool != NULL && pgp_size )
     {
-        tmem_free(pgp->cdata,pgp->size,pool);
-        if ( pool != NULL )
-        {
-            pool->client->compressed_pages--;
-            pool->client->compressed_sum_size -= pgp->size;
-        }
+        pool->client->compressed_pages--;
+        pool->client->compressed_sum_size -= pgp_size;
     }
     pgp->pfp = NULL;
     pgp->size = -1;
@@ -987,10 +1248,62 @@ static void client_freeze(client_t *clie
 
 /************ MEMORY REVOCATION ROUTINES *******************************/
 
+/*
+ * returns 1 if pgp should be evicted now, else 0 with no lock held.  on 1,
+ * unless tmh_lock_all, the obj spinlock and (if dedup) the pcd rwlock are
+ * held, plus the pool rwlock when *hold_pool_rwlock has been set.
+ */
+static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
+{
+    obj_t *obj = pgp->obj;
+    pool_t *pool = obj->pool;
+    client_t *client = pool->client;
+    uint16_t firstbyte = NOT_SHAREABLE; /* set under the obj lock if dedup */
+
+    if ( pool->is_dying )
+        return 0;
+    if ( tmh_lock_all && !obj->no_evict )
+        return 1;
+    if ( tmem_spin_trylock(&obj->obj_spinlock) )
+    {
+        if ( tmh_dedup_enabled() )
+        {
+            firstbyte = pgp->firstbyte;
+            if ( firstbyte ==  NOT_SHAREABLE )
+                goto obj_unlock;
+            ASSERT(firstbyte < 256);
+            if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) )
+                goto obj_unlock;
+            if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted )
+            {
+                /* shared page: requeue at both list tails, skip it once */
+                pgp->eviction_attempted++;
+                list_del(&pgp->global_eph_pages);
+                list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
+                list_del(&pgp->client_eph_pages);
+                list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+                goto pcd_unlock;
+            }
+        }
+        if ( obj->pgp_count > 1 )
+            return 1;
+        if ( tmem_write_trylock(&pool->pool_rwlock) )
+        {
+            *hold_pool_rwlock = 1;
+            return 1;
+        }
+pcd_unlock:
+        /* the pcd rwlock is only taken (and firstbyte valid) with dedup on */
+        if ( tmh_dedup_enabled() )
+            tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]);
+obj_unlock:
+        tmem_spin_unlock(&obj->obj_spinlock);
+    }
+    return 0;
+}
+
 static int tmem_evict(void)
 {
     client_t *client = tmh_client_from_current();
-    pgp_t *pgp = NULL, *pgp_del;
+    pgp_t *pgp = NULL, *pgp2, *pgp_del;
     obj_t *obj;
     pool_t *pool;
     int ret = 0;
@@ -1001,49 +1308,15 @@ static int tmem_evict(void)
     if ( (client != NULL) && client_over_quota(client) &&
          !list_empty(&client->ephemeral_page_list) )
     {
-        list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        
list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     } else if ( list_empty(&global_ephemeral_page_list) ) {
         goto out;
     } else {
-        list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pages)
-        {
-            obj = pgp->obj;
-            pool = obj->pool;
-            if ( pool->is_dying )
-                continue;
-            if ( tmh_lock_all && !obj->no_evict )
+        
list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,global_eph_pages)
+            if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
-            if ( tmem_spin_trylock(&obj->obj_spinlock) )
-            {
-                if ( obj->pgp_count > 1 )
-                    goto found;
-                if ( tmem_write_trylock(&pool->pool_rwlock) )
-                {
-                    hold_pool_rwlock = 1;
-                    goto found;
-                }
-                tmem_spin_unlock(&obj->obj_spinlock);
-            }
-        }
     }
 
     ret = 0;
@@ -1057,10 +1330,16 @@ found:
     ASSERT(obj->no_evict == 0);
     ASSERT(obj->pool != NULL);
     ASSERT_SENTINEL(obj,OBJ);
+    pool = obj->pool;
 
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     pgp_del = pgp_delete_from_obj(obj, pgp->index);
     ASSERT(pgp_del == pgp);
+    if ( tmh_dedup_enabled() && pgp->firstbyte != NOT_SHAREABLE )
+    {
+        ASSERT(pgp->pcd->pgp_ref_count == 1 || pgp->eviction_attempted);
+        pcd_disassociate(pgp,pool,1);
+    }
     pgp_delete(pgp,1);
     if ( obj->pgp_count == 0 )
     {
@@ -1129,25 +1408,30 @@ static NOINLINE int do_tmem_put_compress
 #ifdef __i386__
     return -ENOMEM;
 #endif
+
     if ( pgp->pfp != NULL )
-        pgp_free_data(pgp, pgp->obj->pool);  /* FIXME... is this right? */
+        pgp_free_data(pgp, pgp->obj->pool);
     START_CYC_COUNTER(compress);
     ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
     if ( (ret == -EFAULT) || (ret == 0) )
         goto out;
-    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) )
+    else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
         ret = 0;
-    else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL )
+        goto out;
+    } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) {
+        if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
+            goto out;
+    } else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) {
         ret = -ENOMEM;
-    else
-    {
+        goto out;
+    } else {
         memcpy(p,dst,size);
         pgp->cdata = p;
-        pgp->size = size;
-        pgp->obj->pool->client->compressed_pages++;
-        pgp->obj->pool->client->compressed_sum_size += size;
-        ret = 1;
     }
+    pgp->size = size;
+    pgp->obj->pool->client->compressed_pages++;
+    pgp->obj->pool->client->compressed_sum_size += size;
+    ret = 1;
 
 out:
     END_CYC_COUNTER(compress);
@@ -1155,7 +1439,7 @@ out:
 }
 
 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn,
-       uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva)
+       pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void 
*cva)
 {
     pool_t *pool;
     obj_t *obj;
@@ -1197,6 +1481,11 @@ copy_uncompressed:
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,0);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+    {
+        if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
+            goto failed_dup;
+    }
     pgp->size = 0;
 
 done:
@@ -1239,8 +1528,8 @@ failed_dup:
 
 static NOINLINE int do_tmem_put(pool_t *pool,
               uint64_t oid, uint32_t index,
-              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-              uint32_t pfn_offset, uint32_t len, void *cva)
+              tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+              pagesize_t pfn_offset, pagesize_t len, void *cva)
 {
     obj_t *obj = NULL, *objfound = NULL, *objnew = NULL;
     pgp_t *pgp = NULL, *pgpdel = NULL;
@@ -1308,13 +1597,18 @@ copy_uncompressed:
 copy_uncompressed:
     if ( ( pgp->pfp = tmem_page_alloc(pool) ) == NULL )
     {
-        ret == -ENOMEM;
+        ret = -ENOMEM;
         goto delete_and_free;
     }
     /* tmh_copy_from_client properly handles len==0 (TMEM_NEW_PAGE) */
     ret = tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,cva);
     if ( ret == -EFAULT )
         goto bad_copy;
+    if ( tmh_dedup_enabled() && !is_persistent(pool) )
+    {
+        if ( pcd_associate(pgp,NULL,0) == -ENOMEM )
+            goto delete_and_free;
+    }
     pgp->size = 0;
 
 insert_page:
@@ -1344,6 +1638,8 @@ insert_page:
     pool->good_puts++;
     if ( is_persistent(pool) )
         client->succ_pers_puts++;
+    else
+        tot_good_eph_puts++;
     return 1;
 
 delete_and_free:
@@ -1376,8 +1672,8 @@ ASSERT(0);
 }
 
 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
-              tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-              uint32_t pfn_offset, uint32_t len, void *cva)
+              tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+              pagesize_t pfn_offset, pagesize_t len, void *cva)
 {
     obj_t *obj;
     pgp_t *pgp;
@@ -1404,15 +1700,18 @@ static NOINLINE int do_tmem_get(pool_t *
         return 0;
     }
     ASSERT(pgp->size != -1);
-    if ( pgp->size != 0 )
+    if ( tmh_dedup_enabled() && !is_persistent(pool) &&
+              pgp->firstbyte != NOT_SHAREABLE )
     {
+        if ( pcd_copy_to_client(cmfn, pgp) == -EFAULT )
+            goto bad_copy;
+    } else if ( pgp->size != 0 ) {
         START_CYC_COUNTER(decompress);
         if ( tmh_decompress_to_client(cmfn, pgp->cdata,
                                       pgp->size, cva) == -EFAULT )
             goto bad_copy;
         END_CYC_COUNTER(decompress);
-    }
-    else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
+    } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset,
                                  pfn_offset, len, cva) == -EFAULT)
         goto bad_copy;
     if ( is_ephemeral(pool) )
@@ -1855,11 +2154,15 @@ static int tmemc_list_global(tmem_cli_va
       total_flush_pool, use_long ? ',' : '\n');
     if (use_long)
         n += scnprintf(info+n,BSIZE-n,
-          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n",
+          "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d,"
+          "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n",
           global_eph_count, global_eph_count_max,
           _atomic_read(global_obj_count), global_obj_count_max,
           _atomic_read(global_rtree_node_count), global_rtree_node_count_max,
-          _atomic_read(global_pgp_count), global_pgp_count_max);
+          _atomic_read(global_pgp_count), global_pgp_count_max,
+          _atomic_read(global_page_count), global_page_count_max,
+          _atomic_read(global_pcd_count), global_pcd_count_max,
+         tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize);
     if ( sum + n >= len )
         return sum;
     tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1);
@@ -1912,6 +2215,13 @@ static int tmemc_set_var_one(client_t *c
 #ifdef __i386__
         return -1;
 #endif
+        if ( tmh_dedup_enabled() )
+        {
+            printk("tmem: compression %s for all %ss, cannot be changed "
+                   "when tmem_dedup is enabled\n",
+            tmh_compression_enabled() ? "enabled" : "disabled",client_str);
+            return -1;
+        }
         client->compress = arg1 ? 1 : 0;
         printk("tmem: compression %s for %s=%d\n",
             arg1 ? "enabled" : "disabled",cli_id_str,cli_id);
@@ -2569,14 +2879,28 @@ EXPORT void *tmem_relinquish_pages(unsig
 /* called at hypervisor startup */
 EXPORT void init_tmem(void)
 {
+    int i;
     if ( !tmh_enabled() )
         return;
 
     radix_tree_init();
+    if ( tmh_dedup_enabled() )
+        for (i = 0; i < 256; i++ )
+        {
+            pcd_tree_roots[i] = RB_ROOT;
+            rwlock_init(&pcd_tree_rwlocks[i]);
+        }
+
     if ( tmh_init() )
     {
-        printk("tmem: initialized comp=%d global-lock=%d\n",
-            tmh_compression_enabled(), tmh_lock_all);
+        printk("tmem: initialized comp=%d dedup=%d tze=%d global-lock=%d\n",
+            tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enabled(),
+            tmh_lock_all);
+        if ( tmh_dedup_enabled()&&tmh_compression_enabled()&&tmh_tze_enabled() 
)
+        {
+            tmh_tze_disable();
+            printk("tmem: tze and compression not compatible, disabling 
tze\n");
+        }
         tmem_initialized = 1;
     }
     else
diff -r b8d2a4134a68 xen/common/tmem_xen.c
--- a/xen/common/tmem_xen.c     Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/common/tmem_xen.c     Mon Apr 05 16:09:58 2010 -0600
@@ -19,6 +19,12 @@ boolean_param("tmem", opt_tmem);
 
 EXPORT int opt_tmem_compress = 0;
 boolean_param("tmem_compress", opt_tmem_compress);
+
+EXPORT int opt_tmem_dedup = 0;
+boolean_param("tmem_dedup", opt_tmem_dedup);
+
+EXPORT int opt_tmem_tze = 0;
+boolean_param("tmem_tze", opt_tmem_tze);
 
 EXPORT int opt_tmem_shared_auth = 0;
 boolean_param("tmem_shared_auth", opt_tmem_shared_auth);
@@ -103,8 +109,8 @@ static inline void *cli_mfn_to_va(tmem_c
 #endif
 
 EXPORT int tmh_copy_from_client(pfp_t *pfp,
-    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-    uint32_t pfn_offset, uint32_t len, void *cli_va)
+    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+    pagesize_t pfn_offset, pagesize_t len, void *cli_va)
 {
     unsigned long tmem_mfn;
     void *tmem_va;
@@ -148,7 +154,7 @@ EXPORT int tmh_compress_from_client(tmem
 }
 
 EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
-    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cli_va)
+    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void 
*cli_va)
 {
     unsigned long tmem_mfn, cli_mfn = 0;
     int mark_dirty = 1;
@@ -195,6 +201,27 @@ EXPORT int tmh_decompress_to_client(tmem
         unmap_domain_page(cli_va);
         paging_mark_dirty(current->domain,cli_mfn);
     }
+    mb();
+    return 1;
+}
+/* rebuild a tze page for the client: len data bytes, rest of page zeroed */
+EXPORT int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va,
+                                    pagesize_t len)
+{
+    void *cli_va;
+    unsigned long cli_mfn;
+
+    ASSERT(!(len & (sizeof(uint64_t)-1))); /* len is a multiple of 8 */
+    ASSERT(len <= PAGE_SIZE);
+    ASSERT(len > 0 || tmem_va == NULL); /* len == 0 comes with no buffer */
+    if ( (cli_va = cli_mfn_to_va(cmfn,&cli_mfn)) == NULL)
+        return -EFAULT;
+    if ( len > 0 )
+        memcpy((char *)cli_va,(char *)tmem_va,len);
+    if ( len < PAGE_SIZE ) /* restore the eliminated trailing zeroes */
+        memset((char *)cli_va+len,0,PAGE_SIZE-len);
+    unmap_domain_page(cli_va);
+    paging_mark_dirty(current->domain,cli_mfn);
     mb();
     return 1;
 }
diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h
--- a/xen/include/xen/tmem_xen.h        Wed Mar 03 17:41:58 2010 +0000
+++ b/xen/include/xen/tmem_xen.h        Mon Apr 05 16:09:58 2010 -0600
@@ -26,6 +26,8 @@ struct tmem_host_dependent_client {
 };
 typedef struct tmem_host_dependent_client tmh_client_t;
 
+typedef uint32_t pagesize_t;  /* like size_t, must handle largest PAGE_SIZE */
+
 #define IS_PAGE_ALIGNED(addr) \
   ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) == addr)
 #define IS_VALID_PAGE(_pi)  ( mfn_valid(page_to_mfn(_pi)) )
@@ -52,6 +54,23 @@ static inline int tmh_compression_enable
 static inline int tmh_compression_enabled(void)
 {
     return opt_tmem_compress;
+}
+
+extern int opt_tmem_dedup;  /* dedup flag; defined outside this header */
+static inline int tmh_dedup_enabled(void)
+{
+    return opt_tmem_dedup;  /* raw flag value, returned unmodified */
+}
+
+extern int opt_tmem_tze;  /* trailing-zero-elimination flag; defined elsewhere */
+static inline int tmh_tze_enabled(void)
+{
+    return opt_tmem_tze;  /* raw flag value; cleared by tmh_tze_disable() */
+}
+
+static inline void tmh_tze_disable(void)  /* force tze off at run time */
+{
+    opt_tmem_tze = 0;  /* tmh_tze_enabled() returns 0 from now on */
 }
 
 extern int opt_tmem_shared_auth;
@@ -326,6 +345,101 @@ static inline bool_t tmh_current_is_priv
     return IS_PRIV(current->domain);
 }
 
+/*
+ * Return the first byte of the page described by pfp.
+ *
+ * The page is mapped only for the duration of the read.  The mapping is
+ * released before returning; previously it was never unmapped, leaking one
+ * map_domain_page() mapping per call.
+ */
+static inline uint8_t tmh_get_first_byte(pfp_t *pfp)
+{
+    void *va = __map_domain_page(pfp);
+    uint8_t byte = *(const uint8_t *)va;
+
+    unmap_domain_page(va);
+    return byte;
+}
+
+/*
+ * Three-way comparison of the complete contents of two pages.
+ *
+ * The pages are compared one uint64_t at a time, starting at offset 0.
+ * Returns 0 if all PAGE_SIZE bytes match; otherwise -1 or 1 depending on
+ * whether the first differing 64-bit word of pfp1 is smaller or larger than
+ * the corresponding word of pfp2.
+ *
+ * Both temporary mappings are released before returning (they used to be
+ * leaked).
+ */
+static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2)
+{
+    void *va1 = __map_domain_page(pfp1);
+    void *va2 = __map_domain_page(pfp2);
+    const uint64_t *p1 = va1;
+    const uint64_t *p2 = va2;
+    int i, rc = 0;
+
+    // FIXME: code in assembly?
+    ASSERT(p1 != NULL);
+    ASSERT(p2 != NULL);
+    for ( i = PAGE_SIZE/sizeof(uint64_t); i && *p1 == *p2; i-- )
+    {
+        p1++;
+        p2++;
+    }
+    if ( i )
+        rc = ( *p1 < *p2 ) ? -1 : 1;
+    /*
+     * Unmap through the base addresses: after a full match p1/p2 point one
+     * word past the end of their pages.
+     */
+    unmap_domain_page(va2);
+    unmap_domain_page(va1);
+    return rc;
+}
+
+/*
+ * Three-way comparison of two byte buffers, ordered by length first.
+ *
+ * Returns -1 if len1 < len2 and 1 if len1 > len2.  For equal lengths the
+ * buffers are compared byte by byte (as plain char) and the result is 0 when
+ * they are identical, otherwise -1 or 1 according to the first differing
+ * byte.  Both lengths are asserted to be at most PAGE_SIZE.
+ */
+static inline int tmh_pcd_cmp(void *va1, pagesize_t len1,
+                              void *va2, pagesize_t len2)
+{
+    const char *p1 = (char *)va1;
+    const char *p2 = (char *)va2;
+    pagesize_t i;
+
+    ASSERT(len1 <= PAGE_SIZE);
+    ASSERT(len2 <= PAGE_SIZE);
+    if ( len1 < len2 )
+        return -1;
+    if ( len1 > len2 )
+        return 1;
+    ASSERT(len1 == len2);
+    /* Advance both cursors while bytes match; i counts the bytes left. */
+    for ( i = len2; i && *p1 == *p2; i-- )
+    {
+        p1++;
+        p2++;
+    }
+    if ( !i )
+        return 0;
+    if ( *p1 < *p2 )
+        return -1;
+    return 1;
+}
+
+/*
+ * Three-way comparison of the first pfp_len bytes of a page against
+ * trailing-zero-eliminated data of length tze_len.
+ *
+ * tva is interpreted according to tze_len: when tze_len == PAGE_SIZE it is
+ * treated as a pfp_t * and that page is mapped for the comparison; otherwise
+ * it is used directly as the address of the data.
+ *
+ * Returns -1 if pfp_len < tze_len and 1 if pfp_len > tze_len.  For equal
+ * lengths the data is compared one uint64_t at a time and the result is 0
+ * on a full match, otherwise -1 or 1 according to the first differing word.
+ * Both lengths are asserted to be multiples of sizeof(uint64_t) and at most
+ * PAGE_SIZE.
+ *
+ * Pages are mapped only when the contents actually need to be compared, and
+ * every mapping is released before returning (they used to be leaked on all
+ * return paths).
+ */
+static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len,
+                                  void *tva, pagesize_t tze_len)
+{
+    void *va1, *va2 = NULL;
+    const uint64_t *p1, *p2;
+    pagesize_t i;
+    int rc = 0;
+
+    ASSERT(pfp_len <= PAGE_SIZE);
+    ASSERT(!(pfp_len & (sizeof(uint64_t)-1)));
+    ASSERT(tze_len <= PAGE_SIZE);
+    ASSERT(!(tze_len & (sizeof(uint64_t)-1)));
+    /* Differing lengths decide the order without looking at the data. */
+    if ( pfp_len < tze_len )
+        return -1;
+    if ( pfp_len > tze_len )
+        return 1;
+    ASSERT(pfp_len == tze_len);
+
+    va1 = __map_domain_page(pfp1);
+    p1 = va1;
+    if ( tze_len == PAGE_SIZE )
+    {
+        va2 = __map_domain_page((pfp_t *)tva);
+        p2 = va2;
+    }
+    else
+        p2 = (uint64_t *)tva;
+
+    for ( i = tze_len/sizeof(uint64_t); i && *p1 == *p2; i-- )
+    {
+        p1++;
+        p2++;
+    }
+    if ( i )
+        rc = ( *p1 < *p2 ) ? -1 : 1;
+
+    /* Unmap via the base addresses; the cursors may have left the page. */
+    if ( tze_len == PAGE_SIZE )
+        unmap_domain_page(va2);
+    unmap_domain_page(va1);
+    return rc;
+}
+
+/*
+ * Return the size in bytes of the data in the pfp, ignoring trailing zeroes
+ * and rounded up to the nearest multiple of sizeof(uint64_t).
+ *
+ * The page is scanned backwards one uint64_t at a time; an all-zero page
+ * yields 0 and a page whose last word is non-zero yields PAGE_SIZE.  The
+ * temporary mapping is released before returning (it used to be leaked).
+ */
+static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp)
+{
+    void *va = __map_domain_page(pfp);
+    const uint64_t *p = va;
+    pagesize_t bytecount = PAGE_SIZE;
+
+    /* Drop zero words from the end until a non-zero word or the start. */
+    while ( bytecount && !p[bytecount/sizeof(uint64_t) - 1] )
+        bytecount -= sizeof(uint64_t);
+    unmap_domain_page(va);
+    return bytecount;
+}
+
+/*
+ * Copy the first len bytes of the page described by pfp into the buffer at
+ * tva, one uint64_t at a time.  len is asserted to be a multiple of
+ * sizeof(uint64_t).  The temporary mapping of pfp is released before
+ * returning (it used to be leaked).
+ *
+ * NOTE(review): tva is written through a uint64_t *, so it presumably has to
+ * be suitably aligned and at least len bytes long -- confirm at the callers.
+ */
+static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t len)
+{
+    void *va = __map_domain_page(pfp);
+    uint64_t *p1 = (uint64_t *)tva;
+    const uint64_t *p2 = va;
+    pagesize_t i;
+
+    ASSERT(!(len & (sizeof(uint64_t)-1)));
+    for ( i = len/sizeof(uint64_t); i; i-- )
+        *p1++ = *p2++;
+    unmap_domain_page(va);
+}
+
 /* these typedefs are in the public/tmem.h interface
 typedef XEN_GUEST_HANDLE(void) cli_mfn_t;
 typedef XEN_GUEST_HANDLE(char) cli_va_t;
@@ -378,11 +492,13 @@ extern int tmh_compress_from_client(tmem
 extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*);
 
 extern int tmh_copy_from_client(pfp_t *pfp,
-    tmem_cli_mfn_t cmfn, uint32_t tmem_offset,
-    uint32_t pfn_offset, uint32_t len, void *cva);
+    tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
+    pagesize_t pfn_offset, pagesize_t len, void *cva);
 
 extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp,
-    uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva);
+    pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *cva);
+
+extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, pagesize_t len);
 
 
 #define TMEM_PERF

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.