
[Xen-devel] [RFC PATCH 3/4] tmem: preswap implementation (layered on tmem)



--- linux-2.6.30/mm/page_io.c   2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/page_io.c      2009-06-19 09:33:59.000000000 -0600
@@ -102,6 +102,12 @@
                unlock_page(page);
                goto out;
        }
+       if (preswap_put(page) == 1) {
+               set_page_writeback(page);
+               unlock_page(page);
+               end_page_writeback(page);
+               goto out;
+       }
        bio = get_swap_bio(GFP_NOIO, page_private(page), page,
                                end_swap_bio_write);
        if (bio == NULL) {
@@ -134,6 +140,12 @@
                ret = -ENOMEM;
                goto out;
        }
+       if (preswap_get(page) == 1) {
+               SetPageUptodate(page);
+               unlock_page(page);
+               bio_put(bio);
+               goto out;
+       }
        count_vm_event(PSWPIN);
        submit_bio(READ, bio);
 out:
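
These two hooks short-circuit swap I/O: a preswap_put() that returns 1
replaces the block write, and a preswap_get() that returns 1 replaces the
block read.  As a condensed sketch of the write side (an illustration of
the hunk above using the preswap_put() semantics defined in mm/preswap.c
later in this patch, not additional patch content):

        /*
         * If tmem accepts the page (put returns 1), cycle the normal
         * writeback bookkeeping so callers observe a completed write,
         * but never build or submit a bio to the swap device.
         */
        if (preswap_put(page) == 1) {
                set_page_writeback(page);       /* writeback "starts"... */
                unlock_page(page);
                end_page_writeback(page);       /* ...and completes at once */
                goto out;               /* skip get_swap_bio()/submit_bio() */
        }
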
--- linux-2.6.30/mm/swapfile.c  2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/mm/swapfile.c     2009-06-19 16:20:14.000000000 -0600
@@ -35,7 +35,7 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
 
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
@@ -47,7 +47,7 @@
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
 
 static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
@@ -488,6 +488,7 @@
                                swap_list.next = p - swap_info;
                        nr_swap_pages++;
                        p->inuse_pages--;
+                       preswap_flush(p - swap_info, offset);
                        mem_cgroup_uncharge_swap(ent);
                }
        }
@@ -864,7 +865,7 @@
  * Recycle to start on reaching the end, returning 0 when empty.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-                                       unsigned int prev)
+                               unsigned int prev, unsigned int preswap)
 {
        unsigned int max = si->max;
        unsigned int i = prev;
@@ -890,6 +891,12 @@
                        prev = 0;
                        i = 1;
                }
+               if (preswap) {
+                       if (preswap_test(si, i))
+                               break;
+                       else
+                               continue;
+               }
                count = si->swap_map[i];
                if (count && count != SWAP_MAP_BAD)
                        break;
@@ -901,8 +908,12 @@
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it.  All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * If the boolean preswap is true, unuse only pages in preswap, and at
+ * most pages_to_unuse of them; pages_to_unuse == 0 means all of them.
  */
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, unsigned int preswap,
+               unsigned long pages_to_unuse)
 {
        struct swap_info_struct * si = &swap_info[type];
        struct mm_struct *start_mm;
@@ -938,7 +949,7 @@
         * one pass through swap_map is enough, but not necessarily:
         * there are races when an instance of an entry might be missed.
         */
-       while ((i = find_next_to_unuse(si, i)) != 0) {
+       while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
                if (signal_pending(current)) {
                        retval = -EINTR;
                        break;
@@ -1124,6 +1135,8 @@
                 * interactive performance.
                 */
                cond_resched();
+               if (preswap && pages_to_unuse && !--pages_to_unuse)
+                       break;
        }
 
        mmput(start_mm);
@@ -1448,7 +1461,7 @@
        spin_unlock(&swap_lock);
 
        current->flags |= PF_SWAPOFF;
-       err = try_to_unuse(type);
+       err = try_to_unuse(type, 0, 0);
        current->flags &= ~PF_SWAPOFF;
 
        if (err) {
@@ -1497,9 +1510,14 @@
        swap_map = p->swap_map;
        p->swap_map = NULL;
        p->flags = 0;
+       preswap_flush_area(p - swap_info);
        spin_unlock(&swap_lock);
        mutex_unlock(&swapon_mutex);
        vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+       if (p->preswap_map)
+               vfree(p->preswap_map);
+#endif
        /* Destroy swap account information */
        swap_cgroup_swapoff(type);
 
@@ -1812,6 +1830,11 @@
        }
 
        memset(swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+       p->preswap_map = vmalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
+       if (p->preswap_map)
+               memset(p->preswap_map, 0, BITS_TO_LONGS(maxpages) * sizeof(long));
+#endif
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                int page_nr = swap_header->info.badpages[i];
                if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -1886,6 +1909,7 @@
        } else {
                swap_info[prev].next = p - swap_info;
        }
+       preswap_init(p - swap_info);
        spin_unlock(&swap_lock);
        mutex_unlock(&swapon_mutex);
        error = 0;
@@ -2002,6 +2026,8 @@
 
        si = &swap_info[swp_type(entry)];
        target = swp_offset(entry);
+       if (preswap_test(si, target))
+               return 0;
        base = (target >> our_page_cluster) << our_page_cluster;
        end = base + (1 << our_page_cluster);
        if (!base)              /* first page is swap header */
@@ -2018,6 +2044,9 @@
                        break;
                if (si->swap_map[toff] == SWAP_MAP_BAD)
                        break;
+               /* Don't read in preswap pages */
+               if (preswap_test(si, toff))
+                       break;
        }
        /* Count contiguous allocated slots below our target */
        for (toff = target; --toff >= base; nr_pages++) {
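
For reference, try_to_unuse(type, 0, 0) preserves the old swapoff
behaviour exactly.  A minimal sketch of the new preswap-only mode (this
mirrors what preswap_shrink() in mm/preswap.c below actually does; the
helper name is illustrative only):

        /*
         * Sketch: fault at most @n preswap pages of swap area @type back
         * into process page tables; n == 0 means all of them.
         */
        static int preswap_unuse_some(unsigned int type, unsigned long n)
        {
                int err;

                current->flags |= PF_SWAPOFF;   /* as sys_swapoff() does */
                err = try_to_unuse(type, 1 /* preswap only */, n);
                current->flags &= ~PF_SWAPOFF;
                return err;
        }
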
--- linux-2.6.30/include/linux/swap.h   2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/swap.h      2009-06-19 12:51:55.000000000 -0600
@@ -8,6 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
+#include <linux/vmalloc.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -154,8 +155,62 @@
        unsigned int max;
        unsigned int inuse_pages;
        unsigned int old_block_size;
+#ifdef CONFIG_PRESWAP
+       unsigned long *preswap_map;
+       unsigned int preswap_pages;
+#endif
 };
 
+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+extern int preswap_sysctl_handler(struct ctl_table *, int, struct file *,
+       void __user *, size_t *, loff_t *);
+extern const unsigned long preswap_zero, preswap_infinity;
+
+extern void preswap_shrink(unsigned long);
+extern int preswap_test(struct swap_info_struct *, unsigned long);
+extern void preswap_init(unsigned);
+extern int preswap_put(struct page *);
+extern int preswap_get(struct page *);
+extern void preswap_flush(unsigned, unsigned long);
+extern void preswap_flush_area(unsigned);
+/* in swapfile.c */
+extern int try_to_unuse(unsigned int, unsigned int, unsigned long);
+#else
+static inline void preswap_shrink(unsigned long target_pages)
+{
+}
+
+static inline int preswap_test(struct swap_info_struct *sis,
+       unsigned long offset)
+{
+       return 0;
+}
+
+static inline void preswap_init(unsigned type)
+{
+}
+
+static inline int preswap_put(struct page *page)
+{
+       return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+       return 0;
+}
+
+static inline void preswap_flush(unsigned type, unsigned long offset)
+{
+}
+
+static inline void preswap_flush_area(unsigned type)
+{
+}
+#endif /* CONFIG_PRESWAP */
+
 struct swap_list_t {
        int head;       /* head of priority-ordered swapfile list */
        int next;       /* swapfile to be used next */
@@ -312,6 +367,8 @@
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
+extern struct swap_list_t swap_list;
+extern spinlock_t swap_lock;
 
 /* linux/mm/thrash.c */
 extern struct mm_struct * swap_token_mm;
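
The preswap_map added to swap_info_struct above is a bit-per-slot bitmap
indexed by swap offset (see preswap_test()/preswap_set() in mm/preswap.c
below), so its backing storage must cover maxpages bits rounded up to
whole longs.  That is the sizing rule the sys_swapon() allocation in the
swapfile.c hunk above follows:

        /*
         * Sketch of the sizing: BITS_TO_LONGS() (include/linux/bitops.h)
         * rounds a bit count up to whole longs, so each of the maxpages
         * slots is backed by a bit even when maxpages is not a multiple
         * of BITS_PER_LONG.
         */
        unsigned long *map = vmalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
        if (map)
                memset(map, 0, BITS_TO_LONGS(maxpages) * sizeof(long));
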
--- linux-2.6.30/mm/preswap.c   1969-12-31 17:00:00.000000000 -0700
+++ linux-2.6.30-tmem/mm/preswap.c      2009-06-19 14:55:16.000000000 -0600
@@ -0,0 +1,274 @@
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map.  When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index.  Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal.  If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success.  Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation.  Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include <linux/tmem.h>
+
+static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
+
+const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS              4
+#define SWIZ_MASK              ((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind)     ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)            (_ind >> SWIZ_BITS)
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+       if (!sis->preswap_map)
+               return 0;
+       return test_bit(offset % BITS_PER_LONG,
+               &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_set(struct swap_info_struct *sis,
+                               unsigned long offset)
+{
+       if (!sis->preswap_map)
+               return;
+       set_bit(offset % BITS_PER_LONG,
+               &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_clear(struct swap_info_struct *sis,
+                               unsigned long offset)
+{
+       if (!sis->preswap_map)
+               return;
+       clear_bit(offset % BITS_PER_LONG,
+               &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/* returns 1 if the page was successfully put into preswap, 0 if the page
+ * was declined, and -ERRNO for a specific error */
+int preswap_put(struct page *page)
+{
+       swp_entry_t entry = { .val = page_private(page), };
+       unsigned type = swp_type(entry);
+       pgoff_t offset = swp_offset(entry);
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       unsigned long pfn = page_to_pfn(page);
+       struct swap_info_struct *sis = get_swap_info_struct(type);
+       int dup = 0, ret;
+
+       if ((s32)preswap_poolid < 0)
+               return 0;
+       if (ind64 != ind)
+               return 0;
+       if (preswap_test(sis, offset))
+               dup = 1;
+       mb(); /* ensure page is quiescent; tmem may address it with an alias */
+       ret = (*tmem_ops->put_page)(preswap_poolid, oswiz(type, ind),
+               iswiz(ind), pfn);
+       if (ret == 1) {
+               preswap_set(sis, offset);
+               if (!dup)
+                       sis->preswap_pages++;
+       } else if (dup) {
+               /* failed dup put always results in an automatic flush of
+                * the (older) page from preswap */
+               preswap_clear(sis, offset);
+               sis->preswap_pages--;
+       }
+       return ret;
+}
+
+/* returns 1 if the page was successfully gotten from preswap, 0 if the page
+ * was not present (should never happen!), and -ERRNO for a specific error */
+int preswap_get(struct page *page)
+{
+       swp_entry_t entry = { .val = page_private(page), };
+       unsigned type = swp_type(entry);
+       pgoff_t offset = swp_offset(entry);
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       unsigned long pfn = page_to_pfn(page);
+       struct swap_info_struct *sis = get_swap_info_struct(type);
+       int ret;
+
+       if ((s32)preswap_poolid < 0)
+               return 0;
+       if (ind64 != ind)
+               return 0;
+       if (!preswap_test(sis, offset))
+               return 0;
+       ret = (*tmem_ops->get_page)(preswap_poolid, oswiz(type, ind),
+               iswiz(ind), pfn);
+       return ret;
+}
+
+/* flush a single page from preswap */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+       u64 ind64 = (u64)offset;
+       u32 ind = (u32)offset;
+       struct swap_info_struct *sis = get_swap_info_struct(type);
+       int ret = 1;
+
+       if ((s32)preswap_poolid < 0)
+               return;
+       if (ind64 != ind)
+               return;
+       if (preswap_test(sis, offset)) {
+               ret = (*tmem_ops->flush_page)(preswap_poolid,
+                                       oswiz(type, ind), iswiz(ind));
+               sis->preswap_pages--;
+               preswap_clear(sis, offset);
+       }
+}
+
+/* flush all pages from the passed swaptype */
+void preswap_flush_area(unsigned type)
+{
+       struct swap_info_struct *sis = get_swap_info_struct(type);
+       int ind;
+
+       if ((s32)preswap_poolid < 0)
+               return;
+       for (ind = SWIZ_MASK; ind >= 0; ind--)
+               (void)(*tmem_ops->flush_object)(preswap_poolid,
+                       oswiz(type, ind));
+       sis->preswap_pages = 0;
+}
+
+void preswap_init(unsigned type)
+{
+       /* only need one tmem pool for all swap types */
+       if ((s32)preswap_poolid >= 0)
+               return;
+       if (tmem_ops == NULL)
+               return;
+       preswap_poolid = (*tmem_ops->new_pool)(0, 0, TMEM_POOL_PERSIST);
+}
+
+/*
+ * preswap infrastructure functions
+ */
+
+/* code structure leveraged from sys_swapoff */
+void preswap_shrink(unsigned long target_pages)
+{
+       struct swap_info_struct *si = NULL;
+       unsigned long total_pages = 0, total_pages_to_unuse;
+       unsigned long pages = 0, unuse_pages = 0;
+       int type;
+       int wrapped = 0;
+
+       do {
+               /*
+                * we don't want to hold swap_lock while doing a very
+                * lengthy try_to_unuse, but swap_list may change
+                * so restart scan from swap_list.head each time
+                */
+               spin_lock(&swap_lock);
+               total_pages = 0;
+               for (type = swap_list.head; type >= 0; type = si->next) {
+                       si = get_swap_info_struct(type);
+                       total_pages += si->preswap_pages;
+               }
+               if (total_pages <= target_pages) {
+                       spin_unlock(&swap_lock);
+                       return;
+               }
+               total_pages_to_unuse = total_pages - target_pages;
+               for (type = swap_list.head; type >= 0; type = si->next) {
+                       si = get_swap_info_struct(type);
+                       if (total_pages_to_unuse < si->preswap_pages)
+                               pages = unuse_pages = total_pages_to_unuse;
+                       else {
+                               pages = si->preswap_pages;
+                               unuse_pages = 0; /* unuse all */
+                       }
+                       if (security_vm_enough_memory(pages))
+                               continue;
+                       vm_unacct_memory(pages);
+                       break;
+               }
+               spin_unlock(&swap_lock);
+               if (type < 0)
+                       return;
+               current->flags |= PF_SWAPOFF;
+               (void)try_to_unuse(type, 1, unuse_pages);
+               current->flags &= ~PF_SWAPOFF;
+               wrapped++;
+       } while (wrapped <= 3);
+}
+
+
+#ifdef CONFIG_SYSCTL
+/* cat /proc/sys/vm/preswap provides the total number of pages in preswap
+ * across all swap types.  echo N > /proc/sys/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0). */
+int preswap_sysctl_handler(ctl_table *table, int write,
+       struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+       unsigned long npages;
+       int type;
+       unsigned long totalpages = 0;
+       struct swap_info_struct *si = NULL;
+
+       /* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+       if (!write) {
+               spin_lock(&swap_lock);
+               for (type = swap_list.head; type >= 0; type = si->next) {
+                       si = get_swap_info_struct(type);
+                       totalpages += si->preswap_pages;
+               }
+               spin_unlock(&swap_lock);
+               npages = totalpages;
+       }
+       table->data = &npages;
+       table->maxlen = sizeof(unsigned long);
+       proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+       if (write)
+               preswap_shrink(npages);
+
+       return 0;
+}
+#endif
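
A worked example of the SWIZ_BITS swizzling above (a standalone userspace
demo, not part of the patch; the macros are copied from mm/preswap.c):

        #include <stdio.h>

        #define SWIZ_BITS               4
        #define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
        #define oswiz(_type, _ind)      ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
        #define iswiz(_ind)             (_ind >> SWIZ_BITS)

        int main(void)
        {
                unsigned type = 0, ind;

                /*
                 * Sixteen consecutive swap offsets fan out across all
                 * sixteen tmem objects for this swap type, so concurrent
                 * swap I/O to nearby slots hits different objects.
                 */
                for (ind = 0x120; ind < 0x130; ind++)
                        printf("offset 0x%x -> object 0x%x, index 0x%x\n",
                               ind, oswiz(type, ind), iswiz(ind));
                return 0;
        }
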
--- linux-2.6.30/include/linux/sysctl.h 2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/include/linux/sysctl.h    2009-06-19 09:33:59.000000000 -0600
@@ -205,6 +205,7 @@
        VM_PANIC_ON_OOM=33,     /* panic at out-of-memory */
        VM_VDSO_ENABLED=34,     /* map VDSO into new processes? */
        VM_MIN_SLAB=35,          /* Percent pages ignored by zone reclaim */
+       VM_PRESWAP_PAGES=36,    /* pages/target_pages in preswap */
 };
 
 
--- linux-2.6.30/kernel/sysctl.c        2009-06-09 21:05:27.000000000 -0600
+++ linux-2.6.30-tmem/kernel/sysctl.c   2009-06-19 09:33:59.000000000 -0600
@@ -1282,6 +1282,18 @@
                .proc_handler   = &scan_unevictable_handler,
        },
 #endif
+#ifdef CONFIG_PRESWAP
+       {
+               .ctl_name       = VM_PRESWAP_PAGES,
+               .procname       = "preswap",
+               .data           = NULL,
+               .maxlen         = sizeof(unsigned long),
+               .mode           = 0644,
+               .proc_handler   = &preswap_sysctl_handler,
+               .extra1         = (void *)&preswap_zero,
+               .extra2         = (void *)&preswap_infinity,
+       },
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
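
Since the entry above is added alongside the other vm.* entries, the
tunable appears as /proc/sys/vm/preswap.  A small userspace sketch of
driving it (assumes CONFIG_PRESWAP=y and a tmem-capable hypervisor):

        #include <stdio.h>

        /*
         * Read current preswap usage, then ask the kernel to shrink it to
         * zero; equivalent to `cat /proc/sys/vm/preswap` followed by
         * `echo 0 > /proc/sys/vm/preswap`.
         */
        int main(void)
        {
                unsigned long pages;
                FILE *f = fopen("/proc/sys/vm/preswap", "r");

                if (!f || fscanf(f, "%lu", &pages) != 1)
                        return 1;
                fclose(f);
                printf("preswap pages: %lu\n", pages);

                f = fopen("/proc/sys/vm/preswap", "w");
                if (!f)
                        return 1;
                fprintf(f, "0\n");      /* target 0: evict everything */
                return fclose(f) ? 1 : 0;
        }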
