diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index b36a66e..0c9f12f 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -64,6 +64,8 @@ string_param("badpage", opt_badpage);
 static bool_t opt_bootscrub __initdata = 1;
 boolean_param("bootscrub", opt_bootscrub);
 
+static bool_t opt_nonsmt __initdata = 0;
+boolean_param("nonsmt", opt_nonsmt);
 /*
  * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
  */
@@ -103,6 +105,7 @@ struct scrub_region {
     u64 start;
     u64 chunk_size;
     u64 cpu_block_size;
+    cpumask_t cpu;
 };
 
 static struct scrub_region __initdata region[MAX_NUMNODES];
@@ -1286,6 +1289,7 @@ void __init smp_scrub_heap_pages(void *data)
     /* Determine if we are scrubbing using the boot CPU */
     if ( region->cpu_block_size != ~0ULL )
         /* Determine the current CPU's index into CPU's linked to this node*/
+        /* TODO: Ignore the siblings! */
        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
        {
            if ( cpu == temp_cpu )
@@ -1304,7 +1308,6 @@ void __init smp_scrub_heap_pages(void *data)
     else
         end_mfn = start_mfn + region->chunk_size;
 
-
     for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
     {
         pg = mfn_to_page(mfn);
@@ -1313,10 +1316,9 @@ void __init smp_scrub_heap_pages(void *data)
         if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
             continue;
 
-        /* Every 100MB, print a progress dot. */
-        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
+        /* Every 1G, print a progress dot. */
+        if ( (mfn % ((1024*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
-
         /* Do the scrub if possible */
         if ( page_state_is(pg, free) )
             scrub_one_page(pg);
@@ -1331,23 +1333,26 @@
  */
 void __init scrub_heap_pages(void)
 {
-    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
-    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
+    cpumask_t node_cpus, node_cpus_nonsmt, total_node_cpus_mask = {{ 0 }};
+    unsigned int i, j, boot_cpu_node, total_cpus, cpu = smp_processor_id(), sibling;
     unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
     unsigned long mem_start, mem_end;
+    s_time_t start, end;
 
     if ( !opt_bootscrub )
         return;
 
     boot_cpu_node = cpu_to_node(cpu);
 
-    printk("Scrubbing Free RAM: ");
+    printk("Scrubbing Free RAM on %d nodes\n", num_online_nodes());
 
     /* Scrub block size */
     chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
     if ( chunk_size == 0 )
         chunk_size = 1;
 
+    printk("CPUs have %d threads.\n", cpumask_weight(per_cpu(cpu_sibling_mask, 0)));
+    printk("CPUs have %d cores.\n", cpumask_weight(per_cpu(cpu_core_mask, 0)) / cpumask_weight(per_cpu(cpu_sibling_mask, 0)));
     /* Determine the amount of memory to scrub, per CPU on each Node */
     for_each_online_node ( i )
     {
@@ -1359,27 +1364,49 @@ void __init scrub_heap_pages(void)
         /* It's possible a node has no CPU's */
         if ( cpumask_empty(&node_cpus) )
             continue;
 
-        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
+        node_cpus_nonsmt = node_to_cpumask(i);
+        for_each_cpu(j, &node_cpus)
+        {
+            cpu = 0;
+            for_each_cpu(sibling, per_cpu(cpu_sibling_mask, j)) {
+                if (cpu++ == 0) /* Skip core */
+                    continue;
+                cpumask_clear_cpu(sibling, &node_cpus_nonsmt);
+            }
+        }
+        printk("node%d has %d CPUs non-SMT\n", i, cpumask_weight(&node_cpus_nonsmt));
+        for_each_cpu(j, &node_cpus_nonsmt)
+            printk("#%d,", j);
+
+        printk("\n");
+        if (opt_nonsmt)
+            cpumask_copy(&node_cpus, &node_cpus_nonsmt);
+
+        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
         region[i].cpu_block_size = (mem_end - mem_start) /
                                    cpumask_weight(&node_cpus);
         region[i].start = mem_start;
+        cpumask_copy(&region[i].cpu, &node_cpus);
+        printk("NODE%d scrubbing %lx PFNs spread across %d CPUs\n", i, mem_end - mem_start, cpumask_weight(&node_cpus));
 
         if ( region[i].cpu_block_size > max_cpu_blk_size )
             max_cpu_blk_size = region[i].cpu_block_size;
     }
-
+    cpu = smp_processor_id(); /* We re-used it in the loop. */
     /* Round default chunk size down if required */
     if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
         chunk_size = max_cpu_blk_size;
 
-    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
+    total_cpus = cpumask_weight(&total_node_cpus_mask);
+    printk("Using a total of %d CPUs.\n", total_cpus);
+    start = NOW();
 
     /* Start all CPU's scrubbing memory, chunk_size at a time */
     for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
     {
         process_pending_softirqs();
 
-        atomic_set(&bootscrub_count, total_node_cpus);
+        atomic_set(&bootscrub_count, total_cpus);
 
         spin_lock(&heap_lock);
@@ -1388,7 +1415,7 @@ void __init scrub_heap_pages(void)
         {
             region[i].chunk_size = chunk_size;
             region[i].offset = mfn_off;
-            node_cpus = node_to_cpumask(i);
+            cpumask_copy(&node_cpus, &region[i].cpu);
             /* Clear local cpu ID */
             cpumask_clear_cpu(cpu, &node_cpus);
             /* Start page scrubbing on all other CPU's */
@@ -1406,6 +1433,10 @@ void __init scrub_heap_pages(void)
         spin_unlock(&heap_lock);
     }
 
+    end = NOW();
+    printk("Done SMP scrubbing (%d seconds). Boot scrub on BSP:\n",
+           (u32)((end - start) >> 30));
+    start = NOW();
     /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
     for_each_online_node ( i )
     {
@@ -1416,6 +1447,7 @@ void __init scrub_heap_pages(void)
 
         mem_start = max(node_start_pfn(i), first_valid_mfn);
         mem_end = min(mem_start + node_spanned_pages(i), max_page);
+        printk("NODE%d scrubbing %lx->%lx\n", i, mem_start, mem_end);
 
         region[0].offset = 0;
         region[0].cpu_block_size = ~0ULL;
@@ -1435,7 +1467,8 @@ void __init scrub_heap_pages(void)
             process_pending_softirqs();
         }
     }
-    printk("done.\n");
+    end = NOW();
+    printk("done. (%d seconds)\n", (u32)((end - start) >> 30));
 
     /* Now that the heap is initialized, run checks and set bounds
      * for the low mem virq algorithm. */