
[Xen-changelog] [xen-unstable] [IA64] TLB tracking



# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 0c18c6009448284911dc6ec7c338051ed9471521
# Parent  435e2275ea62428ac60c27b1af585656d67ac6bb
[IA64] TLB tracking

Add TLB insert tracking so that a finer-grained virtual address
range can be flushed when a page is unmapped from a domain.
This functionality is controlled by the compile-time options
xen_ia64_tlb_track (default y) and xen_ia64_tlb_track_cnt (default n).
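
The tracking state lives in three new software bits of the p2m pte
(_PAGE_TLB_TRACKING, _PAGE_TLB_INSERTED and _PAGE_TLB_INSERTED_MANY; see the
pgtable.h hunk below). A condensed sketch of how the unmap side classifies a
pte, mirroring the fast path of the new tlb_track_search_and_remove():

    if (!pte_tlb_tracking(old_pte))
        return TLB_TRACK_NOT_TRACKED; /* not a tracked mapping: full vTLB flush */
    if (!pte_tlb_inserted(old_pte))
        return TLB_TRACK_NOT_FOUND;   /* tracked but never inserted: no flush */
    if (pte_tlb_inserted_many(old_pte))
        return TLB_TRACK_MANY;        /* inserted more than once: full vTLB flush */
    /* otherwise exactly one insert was recorded: flush just that address */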

This patch focuses on grant table mappings.
Normally a full vTLB flush is necessary when a page is unmapped.
By tracking TLB inserts on grant-mapped pages, the full vTLB flush
can often be avoided.
This helps especially when the vbd backend does only DMA: dom0 then never
inserts a TLB entry for the grant-mapped page, so in that case no vTLB
flush is needed at all.
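
The unmap path added to domain_page_flush() (see the mm.c hunk below) then
picks the cheapest flush based on that answer; a condensed sketch, not the
literal hunk:

    struct tlb_track_entry* entry;
    switch (tlb_track_search_and_remove(d->arch.tlb_track,
                                        ptep, old_pte, &entry)) {
    case TLB_TRACK_NOT_FOUND:
        /* e.g. the DMA-only vbd case above: nothing to flush */
        break;
    case TLB_TRACK_FOUND:
        /* a single insert was recorded: flush only that virtual address */
        domain_flush_vtlb_track_entry(d, entry);
        tlb_track_free_entry(d->arch.tlb_track, entry);
        break;
    case TLB_TRACK_NOT_TRACKED:
    case TLB_TRACK_MANY:
        domain_flush_vtlb_all();      /* fall back to the old full flush */
        break;
    case TLB_TRACK_AGAIN:
        BUG();                        /* the p2m entry changed under us */
        break;
    }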

Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
---
 xen/arch/ia64/Rules.mk                       |    8 
 xen/arch/ia64/xen/Makefile                   |    1 
 xen/arch/ia64/xen/domain.c                   |    7 
 xen/arch/ia64/xen/faults.c                   |    8 
 xen/arch/ia64/xen/mm.c                       |   76 +++-
 xen/arch/ia64/xen/tlb_track.c                |  506 +++++++++++++++++++++++++++
 xen/arch/ia64/xen/vcpu.c                     |   13 
 xen/arch/ia64/xen/vhpt.c                     |  123 ++++++
 xen/include/asm-ia64/domain.h                |   30 -
 xen/include/asm-ia64/linux-xen/asm/pgtable.h |   20 +
 xen/include/asm-ia64/p2m_entry.h             |   76 ++++
 xen/include/asm-ia64/perfc_defn.h            |   30 +
 xen/include/asm-ia64/tlb_track.h             |  152 ++++++++
 xen/include/asm-ia64/tlbflush.h              |    9 
 xen/include/asm-ia64/vcpu.h                  |    3 
 xen/include/asm-ia64/vcpumask.h              |   60 +++
 xen/include/asm-ia64/vhpt.h                  |    4 
 xen/include/public/arch-ia64.h               |    3 
 18 files changed, 1073 insertions(+), 56 deletions(-)

diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/Rules.mk
--- a/xen/arch/ia64/Rules.mk    Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/Rules.mk    Sat Oct 14 17:42:00 2006 -0600
@@ -7,6 +7,8 @@ no_warns ?= n
 no_warns ?= n
 xen_ia64_expose_p2m    ?= y
 xen_ia64_pervcpu_vhpt  ?= y
+xen_ia64_tlb_track     ?= y
+xen_ia64_tlb_track_cnt ?= n
 
 ifneq ($(COMPILE_ARCH),$(TARGET_ARCH))
 CROSS_COMPILE ?= /usr/local/sp_env/v2.2.5/i686/bin/ia64-unknown-linux-
@@ -44,6 +46,12 @@ ifeq ($(xen_ia64_pervcpu_vhpt),y)
 ifeq ($(xen_ia64_pervcpu_vhpt),y)
 CFLAGS += -DCONFIG_XEN_IA64_PERVCPU_VHPT
 endif
+ifeq ($(xen_ia64_tlb_track),y)
+CFLAGS += -DCONFIG_XEN_IA64_TLB_TRACK
+endif
+ifeq ($(xen_ia64_tlb_track_cnt),y)
+CFLAGS += -DCONFIG_TLB_TRACK_CNT
+endif
 ifeq ($(no_warns),y)
 CFLAGS += -Wa,--fatal-warnings -Werror -Wno-uninitialized
 endif
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/Makefile
--- a/xen/arch/ia64/xen/Makefile        Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/Makefile        Sat Oct 14 17:42:00 2006 -0600
@@ -29,3 +29,4 @@ obj-y += xencomm.o
 obj-y += xencomm.o
 
 obj-$(crash_debug) += gdbstub.o
+obj-$(xen_ia64_tlb_track) += tlb_track.o
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/domain.c
--- a/xen/arch/ia64/xen/domain.c        Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/domain.c        Sat Oct 14 17:42:00 2006 -0600
@@ -47,6 +47,7 @@
 #include <asm/dom_fw.h>
 #include <asm/shadow.h>
 #include <xen/guest_access.h>
+#include <asm/tlb_track.h>
 
 unsigned long dom0_size = 512*1024*1024;
 unsigned long dom0_align = 64*1024*1024;
@@ -390,6 +391,8 @@ int arch_domain_create(struct domain *d)
        DPRINTK("%s:%d domain %d pervcpu_vhpt %d\n",
                __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
 #endif
+       if (tlb_track_create(d) < 0)
+               goto fail_nomem1;
        d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
        if (d->shared_info == NULL)
            goto fail_nomem;
@@ -418,6 +421,8 @@ int arch_domain_create(struct domain *d)
        return 0;
 
 fail_nomem:
+       tlb_track_destroy(d);
+fail_nomem1:
        if (d->arch.mm.pgd != NULL)
            pgd_free(d->arch.mm.pgd);
        if (d->shared_info != NULL)
@@ -432,6 +437,8 @@ void arch_domain_destroy(struct domain *
            free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
        if (d->arch.shadow_bitmap != NULL)
                xfree(d->arch.shadow_bitmap);
+
+       tlb_track_destroy(d);
 
        /* Clear vTLB for the next domain.  */
        domain_flush_tlb_vhpt(d);
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/faults.c
--- a/xen/arch/ia64/xen/faults.c        Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/faults.c        Sat Oct 14 17:42:00 2006 -0600
@@ -31,6 +31,7 @@
 #include <asm/asm-xsi-offsets.h>
 #include <asm/shadow.h>
 #include <asm/uaccess.h>
+#include <asm/p2m_entry.h>
 
 extern void die_if_kernel(char *str, struct pt_regs *regs, long err);
 /* FIXME: where these declarations shold be there ? */
@@ -202,8 +203,11 @@ void ia64_do_page_fault (unsigned long a
        fault = vcpu_translate(current,address,is_data,&pteval,&itir,&iha);
        if (fault == IA64_NO_FAULT || fault == IA64_USE_TLB) {
                struct p2m_entry entry;
-               pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
-               vcpu_itc_no_srlz(current,is_data?2:1,address,pteval,-1UL,logps);
+               unsigned long m_pteval;
+               m_pteval = translate_domain_pte(pteval, address, itir,
+                                               &logps, &entry);
+               vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4, 
+                                address, m_pteval, pteval, logps, &entry);
                if ((fault == IA64_USE_TLB && !current->arch.dtlb.pte.p) ||
                    p2m_entry_retry(&entry)) {
                        /* dtlb has been purged in-between.  This dtlb was
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c    Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/mm.c    Sat Oct 14 17:42:00 2006 -0600
@@ -172,13 +172,15 @@
 #include <asm/vhpt.h>
 #include <asm/vcpu.h>
 #include <asm/shadow.h>
+#include <asm/p2m_entry.h>
+#include <asm/tlb_track.h>
 #include <linux/efi.h>
 #include <xen/guest_access.h>
 #include <asm/page.h>
 #include <public/memory.h>
 
 static void domain_page_flush(struct domain* d, unsigned long mpaddr,
-                              unsigned long old_mfn, unsigned long new_mfn);
+                              volatile pte_t* ptep, pte_t old_pte);
 
 extern unsigned long ia64_iobase;
 
@@ -798,12 +800,15 @@ flags_to_prot (unsigned long flags)
 
     res |= flags & ASSIGN_readonly ? _PAGE_AR_R: _PAGE_AR_RWX;
     res |= flags & ASSIGN_nocache ? _PAGE_MA_UC: _PAGE_MA_WB;
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+    res |= flags & ASSIGN_tlb_track ? _PAGE_TLB_TRACKING: 0;
+#endif
     
     return res;
 }
 
 /* map a physical address to the specified metaphysical addr */
-// flags: currently only ASSIGN_readonly, ASSIGN_nocache
+// flags: currently only ASSIGN_readonly, ASSIGN_nocache, ASSIGN_tlb_track
 // This is called by assign_domain_mmio_page().
 // So accessing to pte is racy.
 int
@@ -1034,7 +1039,7 @@ assign_domain_mach_page(struct domain *d
 // caller must call set_gpfn_from_mfn() before call if necessary.
 // because set_gpfn_from_mfn() result must be visible before pte xchg
 // caller must use memory barrier. NOTE: xchg has acquire semantics.
-// flags: currently only ASSIGN_readonly
+// flags: ASSIGN_xxx
 static void
 assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
                            unsigned long mfn, unsigned long flags)
@@ -1068,7 +1073,7 @@ assign_domain_page_replace(struct domain
                 set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
             }
 
-            domain_page_flush(d, mpaddr, old_mfn, mfn);
+            domain_page_flush(d, mpaddr, pte, old_pte);
 
             try_to_clear_PGC_allocate(d, old_page);
             put_page(old_page);
@@ -1088,7 +1093,7 @@ assign_domain_page_cmpxchg_rel(struct do
     struct mm_struct *mm = &d->arch.mm;
     volatile pte_t* pte;
     unsigned long old_mfn;
-    unsigned long old_arflags;
+    unsigned long old_prot;
     pte_t old_pte;
     unsigned long new_mfn;
     unsigned long new_prot;
@@ -1098,12 +1103,12 @@ assign_domain_page_cmpxchg_rel(struct do
     pte = lookup_alloc_domain_pte(d, mpaddr);
 
  again:
-    old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
+    old_prot = pte_val(*pte) & ~_PAGE_PPN_MASK;
     old_mfn = page_to_mfn(old_page);
-    old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
+    old_pte = pfn_pte(old_mfn, __pgprot(old_prot));
     if (!pte_present(old_pte)) {
-        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
-                __func__, pte_val(old_pte), old_arflags, old_mfn);
+        DPRINTK("%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx\n",
+                __func__, pte_val(old_pte), old_prot, old_mfn);
         return -EINVAL;
     }
 
@@ -1118,10 +1123,10 @@ assign_domain_page_cmpxchg_rel(struct do
             goto again;
         }
 
-        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
+        DPRINTK("%s: old_pte 0x%lx old_prot 0x%lx old_mfn 0x%lx "
                 "ret_pte 0x%lx ret_mfn 0x%lx\n",
                 __func__,
-                pte_val(old_pte), old_arflags, old_mfn,
+                pte_val(old_pte), old_prot, old_mfn,
                 pte_val(ret_pte), pte_pfn(ret_pte));
         return -EINVAL;
     }
@@ -1133,7 +1138,7 @@ assign_domain_page_cmpxchg_rel(struct do
 
     set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
 
-    domain_page_flush(d, mpaddr, old_mfn, new_mfn);
+    domain_page_flush(d, mpaddr, pte, old_pte);
     put_page(old_page);
     perfc_incrc(assign_domain_pge_cmpxchg_rel);
     return 0;
@@ -1202,7 +1207,7 @@ zap_domain_page_one(struct domain *d, un
         set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
     }
 
-    domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
+    domain_page_flush(d, mpaddr, pte, old_pte);
 
     if (page_get_owner(page) != NULL) {
         try_to_clear_PGC_allocate(d, page);
@@ -1417,8 +1422,12 @@ create_grant_host_mapping(unsigned long 
     BUG_ON(ret == 0);
     BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
            get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
-    assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
-                                              ASSIGN_readonly: ASSIGN_writable);
+    assign_domain_page_replace(d, gpaddr, mfn,
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+                               ASSIGN_tlb_track |
+#endif
+                               ((flags & GNTMAP_readonly) ?
+                                ASSIGN_readonly : ASSIGN_writable));
     perfc_incrc(create_grant_host_mapping);
     return GNTST_okay;
 }
@@ -1473,7 +1482,7 @@ destroy_grant_host_mapping(unsigned long
     }
     BUG_ON(pte_pfn(old_pte) != mfn);
 
-    domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
+    domain_page_flush(d, gpaddr, pte, old_pte);
 
     page = mfn_to_page(mfn);
     BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
@@ -1645,12 +1654,43 @@ guest_physmap_remove_page(struct domain 
 //    flush finer range.
 static void
 domain_page_flush(struct domain* d, unsigned long mpaddr,
-                  unsigned long old_mfn, unsigned long new_mfn)
-{
+                  volatile pte_t* ptep, pte_t old_pte)
+{
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+    struct tlb_track_entry* entry;
+#endif
+
     if (shadow_mode_enabled(d))
         shadow_mark_page_dirty(d, mpaddr >> PAGE_SHIFT);
 
+#ifndef CONFIG_XEN_IA64_TLB_TRACK
     domain_flush_vtlb_all();
+#else
+    switch (tlb_track_search_and_remove(d->arch.tlb_track,
+                                        ptep, old_pte, &entry)) {
+    case TLB_TRACK_NOT_TRACKED:
+        // DPRINTK("%s TLB_TRACK_NOT_TRACKED\n", __func__);
+        domain_flush_vtlb_all();
+        break;
+    case TLB_TRACK_NOT_FOUND:
+        /* do nothing */
+        // DPRINTK("%s TLB_TRACK_NOT_FOUND\n", __func__);
+        break;
+    case TLB_TRACK_FOUND:
+        // DPRINTK("%s TLB_TRACK_FOUND\n", __func__);
+        domain_flush_vtlb_track_entry(d, entry);
+        tlb_track_free_entry(d->arch.tlb_track, entry);
+        break;
+    case TLB_TRACK_MANY:
+        DPRINTK("%s TLB_TRACK_MANY\n", __func__);
+        domain_flush_vtlb_all();
+        break;
+    case TLB_TRACK_AGAIN:
+        DPRINTK("%s TLB_TRACK_AGAIN\n", __func__);
+        BUG();
+        break;
+    }
+#endif
     perfc_incrc(domain_page_flush);
 }
 
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/vcpu.c
--- a/xen/arch/ia64/xen/vcpu.c  Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/vcpu.c  Sat Oct 14 17:42:00 2006 -0600
@@ -24,6 +24,8 @@
 #include <asm/bundle.h>
 #include <asm/privop_stat.h>
 #include <asm/uaccess.h>
+#include <asm/p2m_entry.h>
+#include <asm/tlb_track.h>
 
 /* FIXME: where these declarations should be there ? */
 extern void getreg(unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs);
@@ -2007,7 +2009,9 @@ IA64FAULT vcpu_set_dtr(VCPU *vcpu, u64 s
  VCPU translation cache access routines
 **************************************************************************/
 
-void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps)
+void
+vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte,
+                 UINT64 mp_pte, UINT64 logps, struct p2m_entry* entry)
 {
        unsigned long psr;
        unsigned long ps = (vcpu->domain==dom0) ? logps : PAGE_SHIFT;
@@ -2020,6 +2024,7 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
                              "smaller page size!\n");
 
        BUG_ON(logps > PAGE_SHIFT);
+       vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry);
        psr = ia64_clear_ic();
        ia64_itc(IorD,vaddr,pte,ps); // FIXME: look for bigger mappings
        ia64_set_psr(psr);
@@ -2037,7 +2042,7 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
        // PAGE_SIZE mapping in the vhpt for now, else purging is complicated
        else vhpt_insert(vaddr,pte,PAGE_SHIFT<<2);
 #endif
-       if ((mp_pte == -1UL) || (IorD & 0x4)) // don't place in 1-entry TLB
+       if (IorD & 0x4) /* don't place in 1-entry TLB */
                return;
        if (IorD & 0x1) {
                vcpu_set_tr_entry(&PSCBX(vcpu,itlb),mp_pte,ps<<2,vaddr);
@@ -2062,7 +2067,7 @@ again:
        pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
        if (!pteval) return IA64_ILLOP_FAULT;
        if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
-       vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps);
+       vcpu_itc_no_srlz(vcpu, 2, ifa, pteval, pte, logps, &entry);
        if (swap_rr0) set_metaphysical_rr0();
        if (p2m_entry_retry(&entry)) {
                vcpu_flush_tlb_vhpt_range(ifa, logps);
@@ -2085,7 +2090,7 @@ again:
        pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
        if (!pteval) return IA64_ILLOP_FAULT;
        if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
-       vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps);
+       vcpu_itc_no_srlz(vcpu, 1, ifa, pteval, pte, logps, &entry);
        if (swap_rr0) set_metaphysical_rr0();
        if (p2m_entry_retry(&entry)) {
                vcpu_flush_tlb_vhpt_range(ifa, logps);
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/vhpt.c
--- a/xen/arch/ia64/xen/vhpt.c  Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/arch/ia64/xen/vhpt.c  Sat Oct 14 17:42:00 2006 -0600
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <asm/vhpt.h>
 #include <asm/vcpu.h>
+#include <asm/vcpumask.h>
 #include <asm/vmmu.h>
 
 /* Defined in tlb.c  */
@@ -42,12 +43,14 @@ local_vhpt_flush(void)
 local_vhpt_flush(void)
 {
        __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr));
+       perfc_incrc(local_vhpt_flush);
 }
 
 static void
 vcpu_vhpt_flush(struct vcpu* v)
 {
        __vhpt_flush(vcpu_vhpt_maddr(v));
+       perfc_incrc(vcpu_vhpt_flush);
 }
 
 static void
@@ -170,6 +173,39 @@ pervcpu_vhpt_free(struct vcpu *v)
 }
 #endif
 
+void
+domain_purge_swtc_entries(struct domain *d)
+{
+       struct vcpu* v;
+       for_each_vcpu(d, v) {
+               if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
+                       continue;
+
+               /* Purge TC entries.
+                  FIXME: clear only if match.  */
+               vcpu_purge_tr_entry(&PSCBX(v,dtlb));
+               vcpu_purge_tr_entry(&PSCBX(v,itlb));
+       }
+}
+
+void
+domain_purge_swtc_entries_vcpu_dirty_mask(struct domain* d,
+                                          vcpumask_t vcpu_dirty_mask)
+{
+       int vcpu;
+
+       for_each_vcpu_mask(vcpu, vcpu_dirty_mask) {
+               struct vcpu* v = d->vcpu[vcpu];
+               if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
+                       continue;
+
+               /* Purge TC entries.
+                  FIXME: clear only if match.  */
+               vcpu_purge_tr_entry(&PSCBX(v, dtlb));
+               vcpu_purge_tr_entry(&PSCBX(v, itlb));
+       }
+}
+
 // SMP: we can't assume v == current, vcpu might move to another physical cpu.
 // So memory barrier is necessary.
 // if we can guranttee that vcpu can run on only this physical cpu
@@ -292,15 +328,7 @@ void domain_flush_vtlb_range (struct dom
        }
 #endif
 
-       for_each_vcpu (d, v) {
-               if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
-                       continue;
-
-               /* Purge TC entries.
-                  FIXME: clear only if match.  */
-               vcpu_purge_tr_entry(&PSCBX(v,dtlb));
-               vcpu_purge_tr_entry(&PSCBX(v,itlb));
-       }
+       domain_purge_swtc_entries(d);
        smp_mb();
 
        for_each_vcpu (d, v) {
@@ -327,6 +355,83 @@ void domain_flush_vtlb_range (struct dom
        perfc_incrc(domain_flush_vtlb_range);
 }
 
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+#include <asm/tlb_track.h>
+#include <asm/vmx_vcpu.h>
+void
+__domain_flush_vtlb_track_entry(struct domain* d,
+                                const struct tlb_track_entry* entry)
+{
+       unsigned long rr7_rid;
+       int swap_rr0 = 0;
+       unsigned long old_rid;
+       unsigned long vaddr = entry->vaddr;
+       struct vcpu* v;
+       int cpu;
+       int vcpu;
+
+       BUG_ON((vaddr >> VRN_SHIFT) != VRN7);
+       /*
+        * heuristic:
+        * dom0linux accesses grant mapped pages via the kernel
+        * straight mapped area and it doesn't change rr7 rid. 
+        * So it is likely that rr7 == entry->rid so that
+        * we can avoid rid change.
+        * When blktap is supported, this heuristic should be revised.
+        */
+       vcpu_get_rr(current, VRN7 << VRN_SHIFT, &rr7_rid);
+       if (likely(rr7_rid == entry->rid)) {
+               perfc_incrc(tlb_track_use_rr7);
+       } else {
+               swap_rr0 = 1;
+               vaddr = (vaddr << 3) >> 3;// force vrn0
+               perfc_incrc(tlb_track_swap_rr0);
+       }
+
+       // tlb_track_entry_printf(entry);
+       if (swap_rr0) {
+               vcpu_get_rr(current, 0, &old_rid);
+               vcpu_set_rr(current, 0, entry->rid);
+       }
+    
+       if (HAS_PERVCPU_VHPT(d)) {
+               for_each_vcpu_mask(vcpu, entry->vcpu_dirty_mask) {
+                       v = d->vcpu[vcpu];
+                       if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
+                               continue;
+
+                       /* Invalidate VHPT entries.  */
+                       vcpu_flush_vhpt_range(v, vaddr, PAGE_SIZE);
+               }
+       } else {
+               for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
+                       /* Invalidate VHPT entries.  */
+                       cpu_flush_vhpt_range(cpu, vaddr, PAGE_SIZE);
+               }
+       }
+       /* ptc.ga has release semantics. */
+
+       /* ptc.ga  */
+       ia64_global_tlb_purge(vaddr, vaddr + PAGE_SIZE, PAGE_SHIFT);
+
+       if (swap_rr0) {
+               vcpu_set_rr(current, 0, old_rid);
+       }
+       perfc_incrc(domain_flush_vtlb_track_entry);
+}
+
+void
+domain_flush_vtlb_track_entry(struct domain* d,
+                              const struct tlb_track_entry* entry)
+{
+       domain_purge_swtc_entries_vcpu_dirty_mask(d, entry->vcpu_dirty_mask);
+       smp_mb();
+
+       __domain_flush_vtlb_track_entry(d, entry);
+}
+
+#endif
+
 static void flush_tlb_vhpt_all (struct domain *d)
 {
        /* First VHPT.  */
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/domain.h
--- a/xen/include/asm-ia64/domain.h     Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/domain.h     Sat Oct 14 17:42:00 2006 -0600
@@ -13,28 +13,10 @@
 #include <asm/fpswa.h>
 #include <xen/rangeset.h>
 
-struct p2m_entry {
-    volatile pte_t*     pte;
-    pte_t               used;
-};
-
-static inline void
-p2m_entry_set(struct p2m_entry* entry, volatile pte_t* pte, pte_t used)
-{
-    entry->pte  = pte;
-    entry->used = used;
-}
-
-static inline int
-p2m_entry_retry(struct p2m_entry* entry)
-{
-    //XXX see lookup_domain_pte().
-    //    NULL is set for invalid gpaddr for the time being.
-    if (entry->pte == NULL)
-        return 0;
-
-    return (pte_val(*entry->pte) != pte_val(entry->used));
-}
+struct p2m_entry;
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+struct tlb_track;
+#endif
 
 extern void domain_relinquish_resources(struct domain *);
 struct vcpu;
@@ -140,6 +122,10 @@ struct arch_domain {
     struct last_vcpu last_vcpu[NR_CPUS];
 
     struct arch_vmx_domain arch_vmx; /* Virtual Machine Extensions */
+
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+    struct tlb_track*   tlb_track;
+#endif
 };
 #define INT_ENABLE_OFFSET(v)             \
     (sizeof(vcpu_info_t) * (v)->vcpu_id + \
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/linux-xen/asm/pgtable.h
--- a/xen/include/asm-ia64/linux-xen/asm/pgtable.h      Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/linux-xen/asm/pgtable.h      Sat Oct 14 17:42:00 2006 -0600
@@ -69,6 +69,26 @@
 #define _PAGE_VIRT_D           (__IA64_UL(1) << 53)    /* Virtual dirty bit */
 #define _PAGE_PROTNONE         0
 
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+#define _PAGE_TLB_TRACKING_BIT          54
+#define _PAGE_TLB_INSERTED_BIT          55
+#define _PAGE_TLB_INSERTED_MANY_BIT     56
+
+#define _PAGE_TLB_TRACKING              (1UL << _PAGE_TLB_TRACKING_BIT)
+#define _PAGE_TLB_INSERTED              (1UL << _PAGE_TLB_INSERTED_BIT)
+#define _PAGE_TLB_INSERTED_MANY         (1UL << _PAGE_TLB_INSERTED_MANY_BIT)
+#define _PAGE_TLB_TRACK_MASK            (_PAGE_TLB_TRACKING |          \
+                                         _PAGE_TLB_INSERTED |          \
+                                         _PAGE_TLB_INSERTED_MANY)
+
+#define pte_tlb_tracking(pte)                          \
+    ((pte_val(pte) & _PAGE_TLB_TRACKING) != 0)
+#define pte_tlb_inserted(pte)                          \
+    ((pte_val(pte) & _PAGE_TLB_INSERTED) != 0)
+#define pte_tlb_inserted_many(pte)                     \
+    ((pte_val(pte) & _PAGE_TLB_INSERTED_MANY) != 0)
+#endif // CONFIG_XEN_IA64_TLB_TRACK
+
 /* domVTI */
 #define GPFN_MEM               (0UL << 60)     /* Guest pfn is normal mem */
 #define GPFN_FRAME_BUFFER      (1UL << 60)     /* VGA framebuffer */
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/perfc_defn.h
--- a/xen/include/asm-ia64/perfc_defn.h Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/perfc_defn.h Sat Oct 14 17:42:00 2006 -0600
@@ -109,9 +109,12 @@ PERFPRIVOPADDR(thash)
 #endif
 
 // vhpt.c
+PERFCOUNTER_CPU(local_vhpt_flush,               "local_vhpt_flush")
+PERFCOUNTER_CPU(vcpu_vhpt_flush,                "vcpu_vhpt_flush")
 PERFCOUNTER_CPU(vcpu_flush_vtlb_all,            "vcpu_flush_vtlb_all")
 PERFCOUNTER_CPU(domain_flush_vtlb_all,          "domain_flush_vtlb_all")
 PERFCOUNTER_CPU(vcpu_flush_tlb_vhpt_range,      "vcpu_flush_tlb_vhpt_range")
+PERFCOUNTER_CPU(domain_flush_vtlb_track_entry,  "domain_flush_vtlb_track_entry")
 PERFCOUNTER_CPU(domain_flush_vtlb_range,        "domain_flush_vtlb_range")
 
 // domain.c
@@ -134,3 +137,30 @@ PERFCOUNTER_CPU(domain_page_flush,      
 // dom0vp
 PERFCOUNTER_CPU(dom0vp_phystomach,              "dom0vp_phystomach")
 PERFCOUNTER_CPU(dom0vp_machtophys,              "dom0vp_machtophys")
+
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+// insert or dirty
+PERFCOUNTER_CPU(tlb_track_iod,                  "tlb_track_iod")
+PERFCOUNTER_CPU(tlb_track_iod_again,            "tlb_track_iod_again")
+PERFCOUNTER_CPU(tlb_track_iod_not_tracked,      "tlb_track_iod_not_tracked")
+PERFCOUNTER_CPU(tlb_track_iod_force_many,       "tlb_track_iod_force_many")
+PERFCOUNTER_CPU(tlb_track_iod_tracked_many,     "tlb_track_iod_tracked_many")
+PERFCOUNTER_CPU(tlb_track_iod_tracked_many_del, "tlb_track_iod_tracked_many_del")
+PERFCOUNTER_CPU(tlb_track_iod_found,            "tlb_track_iod_found")
+PERFCOUNTER_CPU(tlb_track_iod_new_entry,        "tlb_track_iod_new_entry")
+PERFCOUNTER_CPU(tlb_track_iod_new_failed,       "tlb_track_iod_new_failed")
+PERFCOUNTER_CPU(tlb_track_iod_new_many,         "tlb_track_iod_new_many")
+PERFCOUNTER_CPU(tlb_track_iod_insert,           "tlb_track_iod_insert")
+PERFCOUNTER_CPU(tlb_track_iod_dirtied,          "tlb_track_iod_dirtied")
+
+// search and remove
+PERFCOUNTER_CPU(tlb_track_sar,                  "tlb_track_sar")
+PERFCOUNTER_CPU(tlb_track_sar_not_tracked,      "tlb_track_sar_not_tracked")
+PERFCOUNTER_CPU(tlb_track_sar_not_found,        "tlb_track_sar_not_found")
+PERFCOUNTER_CPU(tlb_track_sar_found,            "tlb_track_sar_found")
+PERFCOUNTER_CPU(tlb_track_sar_many,             "tlb_track_sar_many")
+
+// flush
+PERFCOUNTER_CPU(tlb_track_use_rr7,              "tlb_track_use_rr7")
+PERFCOUNTER_CPU(tlb_track_swap_rr0,             "tlb_track_swap_rr0")
+#endif
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/tlbflush.h
--- a/xen/include/asm-ia64/tlbflush.h   Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/tlbflush.h   Sat Oct 14 17:42:00 2006 -0600
@@ -22,6 +22,15 @@ void domain_flush_vtlb_all (void);
 /* Global range-flush of vTLB.  */
 void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range);
 
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+struct tlb_track_entry;
+void __domain_flush_vtlb_track_entry(struct domain* d,
+                                     const struct tlb_track_entry* entry);
+/* Global entry-flush of vTLB */
+void domain_flush_vtlb_track_entry(struct domain* d,
+                                   const struct tlb_track_entry* entry);
+#endif
+
 /* Flush vhpt and mTLB on every dirty cpus.  */
 void domain_flush_tlb_vhpt(struct domain *d);
 
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/vcpu.h
--- a/xen/include/asm-ia64/vcpu.h       Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/vcpu.h       Sat Oct 14 17:42:00 2006 -0600
@@ -161,7 +161,8 @@ extern void vcpu_set_next_timer(VCPU *vc
 extern void vcpu_set_next_timer(VCPU *vcpu);
 extern BOOLEAN vcpu_timer_expired(VCPU *vcpu);
 extern UINT64 vcpu_deliverable_interrupts(VCPU *vcpu);
-extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64);
+struct p2m_entry;
+extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64, struct p2m_entry*);
 extern UINT64 vcpu_get_tmp(VCPU *, UINT64);
 extern void vcpu_set_tmp(VCPU *, UINT64, UINT64);
 
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/vhpt.h
--- a/xen/include/asm-ia64/vhpt.h       Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/asm-ia64/vhpt.h       Sat Oct 14 17:42:00 2006 -0600
@@ -18,6 +18,10 @@
 
 #ifndef __ASSEMBLY__
 #include <xen/percpu.h>
+#include <asm/vcpumask.h>
+
+extern void domain_purge_swtc_entries(struct domain *d);
+extern void domain_purge_swtc_entries_vcpu_dirty_mask(struct domain* d, vcpumask_t vcpu_dirty_mask);
 
 //
 // VHPT Long Format Entry (as recognized by hw)
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/public/arch-ia64.h
--- a/xen/include/public/arch-ia64.h    Sat Oct 14 16:42:15 2006 -0600
+++ b/xen/include/public/arch-ia64.h    Sat Oct 14 17:42:00 2006 -0600
@@ -358,6 +358,9 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_guest_conte
 /* Internal only: memory attribute must be WC/UC/UCE.  */
 #define _ASSIGN_nocache                 1
 #define ASSIGN_nocache                  (1UL << _ASSIGN_nocache)
+// tlb tracking
+#define _ASSIGN_tlb_track               2
+#define ASSIGN_tlb_track                (1UL << _ASSIGN_tlb_track)
 
 /* This structure has the same layout of struct ia64_boot_param, defined in
    <asm/system.h>.  It is redefined here to ease use.  */
diff -r 435e2275ea62 -r 0c18c6009448 xen/arch/ia64/xen/tlb_track.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/ia64/xen/tlb_track.c     Sat Oct 14 17:42:00 2006 -0600
@@ -0,0 +1,506 @@
+/******************************************************************************
+ * tlb_track.c
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <asm/tlb_track.h>
+#include <asm/p2m_entry.h>
+#include <asm/vmx_mm_def.h>  /* for IA64_RR_SHIFT */
+#include <asm/vmx_vcpu.h>    /* for VRN7 */
+#include <asm/vcpu.h>        /* for PSCB() */
+
+#define CONFIG_TLB_TRACK_DEBUG
+#ifdef CONFIG_TLB_TRACK_DEBUG
+# define tlb_track_printd(fmt, ...)     \
+    printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__)
+#else
+# define tlb_track_printd(fmt, ...)     do { } while (0)
+#endif
+
+static int
+tlb_track_allocate_entries(struct tlb_track* tlb_track)
+{
+    struct page_info* entry_page;
+    struct tlb_track_entry* track_entries;
+    unsigned int allocated;
+    unsigned long i;
+
+    BUG_ON(tlb_track->num_free > 0);
+    if (tlb_track->num_entries >= tlb_track->limit) {
+        DPRINTK("%s: num_entries %d limit %d\n",
+                __func__, tlb_track->num_entries, tlb_track->limit);
+        return -ENOMEM;
+    }
+    entry_page = alloc_domheap_page(NULL);
+    if (entry_page == NULL) {
+        DPRINTK("%s: domheap page failed. num_entries %d limit %d\n",
+                __func__, tlb_track->num_entries, tlb_track->limit);
+        return -ENOMEM;
+    }
+
+    list_add(&entry_page->list, &tlb_track->page_list);
+    track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
+    allocated = PAGE_SIZE / sizeof(track_entries[0]);
+    tlb_track->num_entries += allocated;
+    tlb_track->num_free += allocated;
+    for (i = 0; i < allocated; i++) {
+        list_add(&track_entries[i].list, &tlb_track->free_list);
+        // tlb_track_printd("track_entries[%ld] 0x%p\n", i, &track_entries[i]);
+    }
+    tlb_track_printd("allocated %d num_entries %d num_free %d\n",
+                     allocated, tlb_track->num_entries, tlb_track->num_free);
+    return 0;
+}
+
+
+int
+tlb_track_create(struct domain* d)
+{
+    struct tlb_track* tlb_track = NULL;
+    struct page_info* hash_page = NULL;
+    unsigned int hash_size;
+    unsigned int hash_shift;
+    unsigned int i;
+
+    tlb_track = xmalloc(struct tlb_track);
+    if (tlb_track == NULL)
+        goto out;
+
+    hash_page = alloc_domheap_page(NULL);
+    if (hash_page == NULL)
+        goto out;
+
+    spin_lock_init(&tlb_track->free_list_lock);
+    INIT_LIST_HEAD(&tlb_track->free_list);
+    tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
+    tlb_track->num_entries = 0;
+    tlb_track->num_free = 0;
+    INIT_LIST_HEAD(&tlb_track->page_list);
+    if (tlb_track_allocate_entries(tlb_track) < 0)
+        goto out;
+
+    spin_lock_init(&tlb_track->hash_lock);
+    /* XXX hash size optimization */
+    hash_size = PAGE_SIZE / sizeof(tlb_track->hash[0]);
+    for (hash_shift = 0; (1 << (hash_shift + 1)) < hash_size; hash_shift++)
+        /* nothing */;
+    tlb_track->hash_size = (1 << hash_shift);
+    tlb_track->hash_shift = hash_shift;
+    tlb_track->hash_mask = (1 << hash_shift) - 1;
+    tlb_track->hash = page_to_virt(hash_page);
+    for (i = 0; i < tlb_track->hash_size; i++)
+        INIT_LIST_HEAD(&tlb_track->hash[i]);
+
+    smp_mb(); /* make initialization visible before use. */
+    d->arch.tlb_track = tlb_track;
+    printk("%s:%d hash 0x%p hash_size %d \n",
+           __func__, __LINE__, tlb_track->hash, tlb_track->hash_size);
+
+    return 0;
+
+out:
+    if (hash_page != NULL)
+        free_domheap_page(hash_page);
+
+    if (tlb_track != NULL)
+        xfree(tlb_track);
+
+    return -ENOMEM;
+}
+
+void
+tlb_track_destroy(struct domain* d)
+{
+    struct tlb_track* tlb_track = d->arch.tlb_track;
+    struct page_info* page;
+    struct page_info* next;
+
+    spin_lock(&tlb_track->free_list_lock);
+    BUG_ON(tlb_track->num_free != tlb_track->num_entries);
+
+    list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
+        list_del(&page->list);
+        free_domheap_page(page);
+    }
+
+    free_domheap_page(virt_to_page(tlb_track->hash));
+    xfree(tlb_track);
+    // d->tlb_track = NULL;
+}
+
+static struct tlb_track_entry*
+tlb_track_get_entry(struct tlb_track* tlb_track)
+{
+    struct tlb_track_entry* entry = NULL;
+    spin_lock(&tlb_track->free_list_lock);
+    if (tlb_track->num_free == 0)
+        (void)tlb_track_allocate_entries(tlb_track);
+
+    if (tlb_track->num_free > 0) {
+        BUG_ON(list_empty(&tlb_track->free_list));
+        entry = list_entry(tlb_track->free_list.next,
+                           struct tlb_track_entry, list);
+        tlb_track->num_free--;
+        list_del(&entry->list);
+    }
+    spin_unlock(&tlb_track->free_list_lock);
+    return entry;
+}
+
+void
+tlb_track_free_entry(struct tlb_track* tlb_track,
+                     struct tlb_track_entry* entry)
+{
+    spin_lock(&tlb_track->free_list_lock);
+    list_add(&entry->list, &tlb_track->free_list);
+    tlb_track->num_free++;
+    spin_unlock(&tlb_track->free_list_lock);
+}
+
+
+#include <linux/hash.h>
+/* XXX hash function. */
+static struct list_head*
+tlb_track_hash_head(struct tlb_track* tlb_track, volatile pte_t* ptep)
+{
+    unsigned long hash = hash_long((unsigned long)ptep, tlb_track->hash_shift);
+    BUG_ON(hash >= tlb_track->hash_size);
+    BUG_ON((hash & tlb_track->hash_mask) != hash);
+    return &tlb_track->hash[hash];
+}
+
+static int
+tlb_track_pte_zapped(pte_t old_pte, pte_t ret_pte)
+{
+    if (pte_pfn(old_pte) != pte_pfn(ret_pte) ||
+        (pte_val(old_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK)) !=
+        (pte_val(ret_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK))) {
+        /* Other thread zapped the p2m entry. */
+        return 1;
+    }
+    return 0;
+}
+
+static TLB_TRACK_RET_T
+tlb_track_insert_or_dirty(struct tlb_track* tlb_track, struct mm_struct* mm,
+                          volatile pte_t* ptep, pte_t old_pte,
+                          unsigned long vaddr, unsigned long rid)
+{
+    unsigned long mfn = pte_pfn(old_pte);
+    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
+    struct tlb_track_entry* entry;
+    struct tlb_track_entry* new_entry = NULL;
+    unsigned long bit_to_be_set = _PAGE_TLB_INSERTED;
+    pte_t new_pte;
+    pte_t ret_pte;
+
+    struct vcpu* v = current;
+    TLB_TRACK_RET_T ret = TLB_TRACK_NOT_FOUND;
+
+#if 0 /* this is done at vcpu_tlb_track_insert_or_dirty() */
+    perfc_incrc(tlb_track_iod);
+    if (!pte_tlb_tracking(old_pte)) {
+        perfc_incrc(tlb_track_iod_not_tracked);
+        return TLB_TRACK_NOT_TRACKED;
+    }
+#endif
+    if (pte_tlb_inserted_many(old_pte)) {
+        perfc_incrc(tlb_track_iod_tracked_many);
+        return TLB_TRACK_MANY;
+    }
+
+    /* vaddr must be normalized so that it is in vrn7 and page aligned. */
+    BUG_ON((vaddr >> IA64_RR_SHIFT) != VRN7);
+    BUG_ON((vaddr & ~PAGE_MASK) != 0);
+#if 0
+    tlb_track_printd("\n"
+                     "\tmfn 0x%016lx\n"
+                     "\told_pte 0x%016lx ptep 0x%p\n"
+                     "\tptep_val 0x%016lx vaddr 0x%016lx rid %ld\n"
+                     "\ttlb_track 0x%p head 0x%p\n",
+                     mfn,
+                     pte_val(old_pte), ptep, pte_val(*ptep),
+                     vaddr, rid,
+                     tlb_track, head);
+#endif
+
+ again:
+    /*
+     * zapping side may zap the p2m entry and then remove tlb track entry
+     * non-atomically. We may see the stale tlb track entry here.
+     * p2m_entry_retry() handles such a case.
+     * Or other thread may zap the p2m entry and remove tlb track entry
+     * and inserted new tlb track entry.
+     */
+    spin_lock(&tlb_track->hash_lock);
+    list_for_each_entry(entry, head, list) {
+        if (entry->ptep != ptep)
+            continue;
+
+        if (pte_pfn(entry->pte_val) == mfn) {
+            // tlb_track_entry_printf(entry);
+            if (entry->vaddr == vaddr && entry->rid == rid) {
+                // tlb_track_printd("TLB_TRACK_FOUND\n");
+                ret = TLB_TRACK_FOUND;
+                perfc_incrc(tlb_track_iod_found);
+#ifdef CONFIG_TLB_TRACK_CNT
+                entry->cnt++;
+                if (entry->cnt > TLB_TRACK_CNT_FORCE_MANY) {
+                    /*
+                     * heuristics:
+                     * If a page is used to transfer data by dev channel,
+                     * it would be unmapped with small amount access
+                     * (once or twice tlb insert) after real device
+                     * I/O completion. It would be short period.
+                     * However this page seems to be accessed many times.
+                     * We guess that this page is used I/O ring
+                     * so that tracking this entry might be useless.
+                     */
+                     // tlb_track_entry_printf(entry);
+                     // tlb_track_printd("cnt = %ld\n", entry->cnt);
+                    perfc_incrc(tlb_track_iod_force_many);
+                    goto force_many;
+                }
+#endif
+                goto found;
+            } else {
+#ifdef CONFIG_TLB_TRACK_CNT
+            force_many:
+#endif
+                if (!pte_tlb_inserted(old_pte)) {
+                    printk("%s:%d racy update\n", __func__, __LINE__);
+                    old_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED);
+                }
+                new_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED_MANY);
+                ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
+                if (pte_val(ret_pte) != pte_val(old_pte)) {
+                    // tlb_track_printd("TLB_TRACK_AGAIN\n");
+                    ret = TLB_TRACK_AGAIN;
+                    perfc_incrc(tlb_track_iod_again);
+                } else {
+                    // tlb_track_printd("TLB_TRACK_MANY del entry 0x%p\n",
+                    //                  entry);
+                    ret = TLB_TRACK_MANY;
+                    list_del(&entry->list);
+                    // tlb_track_entry_printf(entry);
+                    perfc_incrc(tlb_track_iod_tracked_many_del);
+                }
+                goto out;
+            }
+        }
+
+        /*
+         * Another thread may have changed the p2m entry, removed the old
+         * tlb track entry and inserted a new one after we read old_pte,
+         * but before we took the spinlock.
+         */
+        // tlb_track_printd("TLB_TRACK_AGAIN\n");
+        ret = TLB_TRACK_AGAIN;
+        perfc_incrc(tlb_track_iod_again);
+        goto out;
+    }
+
+    entry = NULL; // prevent freeing entry.
+    if (pte_tlb_inserted(old_pte)) {
+        /* Another thread removed the tlb_track_entry after we got old_pte
+           but before we took the spin lock. */
+        ret = TLB_TRACK_AGAIN;
+        perfc_incrc(tlb_track_iod_again);
+        goto out;
+    }
+    if (new_entry == NULL && bit_to_be_set == _PAGE_TLB_INSERTED) {
+        spin_unlock(&tlb_track->hash_lock);
+        new_entry = tlb_track_get_entry(tlb_track);
+        if (new_entry == NULL) {
+            tlb_track_printd("get_entry failed\n");
+            /* entry can't be allocated.
+               fall back to full-flush mode. */
+            bit_to_be_set |= _PAGE_TLB_INSERTED_MANY;
+            perfc_incrc(tlb_track_iod_new_failed);
+        }
+        // tlb_track_printd("new_entry 0x%p\n", new_entry);
+        perfc_incrc(tlb_track_iod_new_entry);
+        goto again;
+    }
+
+    BUG_ON(pte_tlb_inserted_many(old_pte));
+    new_pte = __pte(pte_val(old_pte) | bit_to_be_set);
+    ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
+    if (pte_val(old_pte) != pte_val(ret_pte)) {
+        if (tlb_track_pte_zapped(old_pte, ret_pte)) {
+            // tlb_track_printd("zapped TLB_TRACK_AGAIN\n");
+            ret = TLB_TRACK_AGAIN;
+            perfc_incrc(tlb_track_iod_again);
+            goto out;
+        }
+
+        /* Other thread set _PAGE_TLB_INSERTED and/or _PAGE_TLB_INSERTED_MANY */
+        if (pte_tlb_inserted_many(ret_pte)) {
+            /* Other thread already set _PAGE_TLB_INSERTED_MANY and
+               removed the entry. */
+            // tlb_track_printd("inserted TLB_TRACK_MANY\n");
+            BUG_ON(!pte_tlb_inserted(ret_pte));
+            ret = TLB_TRACK_MANY;
+            perfc_incrc(tlb_track_iod_new_many);
+            goto out;
+        }
+        BUG_ON(pte_tlb_inserted(ret_pte));
+        BUG();
+    }
+    if (new_entry) {
+        // tlb_track_printd("inserting new_entry 0x%p\n", new_entry);
+        entry = new_entry;
+        new_entry = NULL;
+
+        entry->ptep = ptep;
+        entry->pte_val = old_pte;
+        entry->vaddr = vaddr;
+        entry->rid = rid;
+        cpus_clear(entry->pcpu_dirty_mask);
+        vcpus_clear(entry->vcpu_dirty_mask);
+        list_add(&entry->list, head);
+
+#ifdef CONFIG_TLB_TRACK_CNT
+        entry->cnt = 0;
+#endif
+        perfc_incrc(tlb_track_iod_insert);
+        // tlb_track_entry_printf(entry);
+    } else {
+        goto out;
+    }
+
+ found:
+    BUG_ON(v->processor >= NR_CPUS);
+    cpu_set(v->processor, entry->pcpu_dirty_mask);
+    BUG_ON(v->vcpu_id >= NR_CPUS);
+    vcpu_set(v->vcpu_id, entry->vcpu_dirty_mask);
+    perfc_incrc(tlb_track_iod_dirtied);
+
+ out:
+    spin_unlock(&tlb_track->hash_lock);
+    if (ret == TLB_TRACK_MANY && entry != NULL)
+        tlb_track_free_entry(tlb_track, entry);
+    if (new_entry != NULL)
+        tlb_track_free_entry(tlb_track, new_entry);
+    return ret;
+}
+
+void
+__vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
+                                 struct p2m_entry* entry)
+{
+    unsigned long vrn = vaddr >> IA64_RR_SHIFT;
+    unsigned long rid = PSCB(vcpu, rrs[vrn]);
+    TLB_TRACK_RET_T ret;
+
+    /* Normalize to vrn7.
+       For the Linux dom0 case, vrn7 is the most common case. */
+    vaddr |= VRN7 << VRN_SHIFT;
+    vaddr &= PAGE_MASK;
+    ret = tlb_track_insert_or_dirty(vcpu->domain->arch.tlb_track,
+                                    &vcpu->domain->arch.mm,
+                                    entry->ptep, entry->used,
+                                    vaddr, rid);
+    if (ret == TLB_TRACK_AGAIN)
+        p2m_entry_set_retry(entry);
+}
+
+TLB_TRACK_RET_T
+tlb_track_search_and_remove(struct tlb_track* tlb_track,
+                            volatile pte_t* ptep, pte_t old_pte,
+                            struct tlb_track_entry** entryp)
+{
+    unsigned long mfn = pte_pfn(old_pte);
+    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
+    struct tlb_track_entry* entry;
+
+    perfc_incrc(tlb_track_sar);
+    if (!pte_tlb_tracking(old_pte)) {
+        perfc_incrc(tlb_track_sar_not_tracked);
+        return TLB_TRACK_NOT_TRACKED;
+    }
+    if (!pte_tlb_inserted(old_pte)) {
+        BUG_ON(pte_tlb_inserted_many(old_pte));
+        perfc_incrc(tlb_track_sar_not_found);
+        return TLB_TRACK_NOT_FOUND;
+    }
+    if (pte_tlb_inserted_many(old_pte)) {
+        BUG_ON(!pte_tlb_inserted(old_pte));
+        perfc_incrc(tlb_track_sar_many);
+        return TLB_TRACK_MANY;
+    }
+
+    spin_lock(&tlb_track->hash_lock);
+    list_for_each_entry(entry, head, list) {
+        if (entry->ptep != ptep)
+            continue;
+
+        if (pte_pfn(entry->pte_val) == mfn) {
+            list_del(&entry->list);
+            spin_unlock(&tlb_track->hash_lock);
+            *entryp = entry;
+            perfc_incrc(tlb_track_sar_found);
+            // tlb_track_entry_printf(entry);
+#ifdef CONFIG_TLB_TRACK_CNT
+            // tlb_track_printd("cnt = %ld\n", entry->cnt);
+#endif
+            return TLB_TRACK_FOUND;
+        }
+        BUG();
+    }
+    BUG();
+    spin_unlock(&tlb_track->hash_lock);
+    return TLB_TRACK_NOT_TRACKED;
+}
+
+/* for debug */
+void
+__tlb_track_entry_printf(const char* func, int line,
+                         const struct tlb_track_entry* entry)
+{
+    char pcpumask_buf[NR_CPUS + 1];
+    char vcpumask_buf[MAX_VIRT_CPUS + 1];
+    cpumask_scnprintf(pcpumask_buf, sizeof(pcpumask_buf),
+                      entry->pcpu_dirty_mask);
+    vcpumask_scnprintf(vcpumask_buf, sizeof(vcpumask_buf),
+                       entry->vcpu_dirty_mask);
+    printk("%s:%d\n"
+           "\tmfn 0x%016lx\n"
+           "\told_pte 0x%016lx ptep 0x%p\n"
+           "\tpte_val 0x%016lx vaddr 0x%016lx rid %ld\n"
+           "\tpcpu_dirty_mask %s vcpu_dirty_mask %s\n"
+           "\tentry 0x%p\n",
+           func, line,
+           pte_pfn(entry->pte_val),
+           pte_val(entry->pte_val), entry->ptep, pte_val(*entry->ptep),
+           entry->vaddr, entry->rid,
+           pcpumask_buf, vcpumask_buf,
+           entry);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/p2m_entry.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-ia64/p2m_entry.h  Sat Oct 14 17:42:00 2006 -0600
@@ -0,0 +1,76 @@
+/******************************************************************************
+ * p2m_entry.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __ASM_P2M_ENTRY_H__
+#define __ASM_P2M_ENTRY_H__
+
+#include <asm/pgtable.h>
+
+struct p2m_entry {
+#define P2M_PTE_ALWAYS_RETRY   ((volatile pte_t*) -1)
+    volatile pte_t*     ptep;
+    pte_t               used;
+};
+
+static inline void
+p2m_entry_set(struct p2m_entry* entry, volatile pte_t* ptep, pte_t used)
+{
+    entry->ptep = ptep;
+    entry->used = used;
+}
+
+static inline void
+p2m_entry_set_retry(struct p2m_entry* entry)
+{
+    entry->ptep = P2M_PTE_ALWAYS_RETRY;
+}
+
+static inline int
+p2m_entry_retry(struct p2m_entry* entry)
+{
+    /* XXX see lookup_domain_pte().
+       NULL is set for invalid gpaddr for the time being. */
+    if (entry->ptep == NULL)
+        return 0;
+
+    if (entry->ptep == P2M_PTE_ALWAYS_RETRY)
+        return 1;
+
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+    return ((pte_val(*entry->ptep) & ~_PAGE_TLB_TRACK_MASK) !=
+            (pte_val(entry->used) & ~_PAGE_TLB_TRACK_MASK));
+#else
+    return (pte_val(*entry->ptep) != pte_val(entry->used));
+#endif
+}
+
+#endif // __ASM_P2M_ENTRY_H__
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/tlb_track.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-ia64/tlb_track.h  Sat Oct 14 17:42:00 2006 -0600
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * tlb_track.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __TLB_TRACK_H__
+#define __TLB_TRACK_H__
+
+#ifdef CONFIG_XEN_IA64_TLB_TRACK
+
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <asm/domain.h>
+#include <xen/list.h>
+#include <asm/p2m_entry.h>
+#include <asm/vcpumask.h>
+
+// TODO: compact this structure.
+struct tlb_track_entry {
+    struct list_head   list;
+
+    volatile pte_t*     ptep;           // corresponding p2m entry
+
+    /* XXX should we use TR_ENTRY? */
+    pte_t               pte_val;        // mfn and other flags
+                                        // pte_val.p = 1:
+                                        //   tlb entry is inserted.
+                                        // pte_val.p = 0:
+                                        //   a tlb entry was once inserted
+                                        //   (so this entry was created),
+                                        //   but a tlb purge has since been
+                                        //   issued, so this virtual address
+                                        //   need not be purged.
+    unsigned long       vaddr;          // virtual address
+    unsigned long       rid;            // rid
+
+    cpumask_t           pcpu_dirty_mask;
+    vcpumask_t          vcpu_dirty_mask;
+    // tlbflush_timestamp;
+
+#ifdef CONFIG_TLB_TRACK_CNT
+#define TLB_TRACK_CNT_FORCE_MANY        256 /* XXX how many? */
+    unsigned long       cnt;
+#endif
+};
+
+struct tlb_track {
+
+/* see __gnttab_map_grant_ref()
+   A domain can map up to MAPTRACK_MAX_ENTRIES granted pages. */
+#define TLB_TRACK_LIMIT_ENTRIES                                     \
+    (MAPTRACK_MAX_ENTRIES * (PAGE_SIZE / sizeof(struct tlb_track)))
+
+    spinlock_t                  free_list_lock;
+    struct list_head            free_list;
+    unsigned int                limit;
+    unsigned int                num_entries;
+    unsigned int                num_free;
+    struct list_head            page_list;
+
+    /* XXX hash table size */
+    spinlock_t                  hash_lock;
+    unsigned int                hash_size;
+    unsigned int                hash_shift;
+    unsigned int                hash_mask;
+    struct list_head*           hash;
+};
+
+int tlb_track_create(struct domain* d);
+void tlb_track_destroy(struct domain* d);
+
+void tlb_track_free_entry(struct tlb_track* tlb_track,
+                          struct tlb_track_entry* entry);
+
+void
+__vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
+                                 struct p2m_entry* entry);
+static inline void
+vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
+                               struct p2m_entry* entry)
+{
+    /* optimization.
+       non-tracking pte is most common. */
+    perfc_incrc(tlb_track_iod);
+    if (!pte_tlb_tracking(entry->used)) {
+        perfc_incrc(tlb_track_iod_not_tracked);
+        return;
+    }
+
+    __vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry);
+}
+
+
+/* return value
+ * NULL if this entry is used
+ * entry if this entry isn't used
+ */
+enum TLB_TRACK_RET {
+    TLB_TRACK_NOT_TRACKED,
+    TLB_TRACK_NOT_FOUND,
+    TLB_TRACK_FOUND,
+    TLB_TRACK_MANY,
+    TLB_TRACK_AGAIN,
+};
+typedef enum TLB_TRACK_RET TLB_TRACK_RET_T;
+
+TLB_TRACK_RET_T
+tlb_track_search_and_remove(struct tlb_track* tlb_track, 
+                            volatile pte_t* ptep, pte_t old_pte, 
+                            struct tlb_track_entry** entryp);
+
+void
+__tlb_track_entry_printf(const char* func, int line,
+                         const struct tlb_track_entry* entry);
+#define tlb_track_entry_printf(entry)                       \
+    __tlb_track_entry_printf(__func__, __LINE__, (entry))
+#else
+
+#define tlb_track_create(d)                                (0)
+#define tlb_track_destroy(d)                               do { } while (0)
+#define vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry) do { } while (0)
+
+#endif /* CONFIG_XEN_IA64_TLB_TRACK */
+
+#endif /* __TLB_TRACK_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 435e2275ea62 -r 0c18c6009448 xen/include/asm-ia64/vcpumask.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/asm-ia64/vcpumask.h   Sat Oct 14 17:42:00 2006 -0600
@@ -0,0 +1,60 @@
+#ifndef __XEN_VCPUMASK_H
+#define __XEN_VCPUMASK_H
+
+/* vcpu mask
+   stolen from cpumask.h */
+typedef struct { DECLARE_BITMAP(bits, MAX_VIRT_CPUS); } vcpumask_t;
+
+#define vcpu_set(vcpu, dst) __vcpu_set((vcpu), &(dst))
+static inline void __vcpu_set(int vcpu, volatile vcpumask_t *dstp)
+{
+    set_bit(vcpu, dstp->bits);
+}
+#define vcpus_clear(dst) __vcpus_clear(&(dst), MAX_VIRT_CPUS)
+static inline void __vcpus_clear(vcpumask_t *dstp, int nbits)
+{
+    bitmap_zero(dstp->bits, nbits);
+}
+/* No static inline type checking - see Subtlety (1) above. */
+#define vcpu_isset(vcpu, vcpumask) test_bit((vcpu), (vcpumask).bits)
+
+#define first_vcpu(src) __first_vcpu(&(src), MAX_VIRT_CPUS)
+static inline int __first_vcpu(const vcpumask_t *srcp, int nbits)
+{
+    return min_t(int, nbits, find_first_bit(srcp->bits, nbits));
+}
+
+#define next_vcpu(n, src) __next_vcpu((n), &(src), MAX_VIRT_CPUS)
+static inline int __next_vcpu(int n, const vcpumask_t *srcp, int nbits)
+{
+    return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1));
+}
+
+#if MAX_VIRT_CPUS > 1
+#define for_each_vcpu_mask(vcpu, mask)          \
+    for ((vcpu) = first_vcpu(mask);             \
+         (vcpu) < MAX_VIRT_CPUS;                \
+         (vcpu) = next_vcpu((vcpu), (mask)))
+#else /* MAX_VIRT_CPUS == 1 */
+#define for_each_vcpu_mask(vcpu, mask) for ((vcpu) = 0; (vcpu) < 1; (vcpu)++)
+#endif /* MAX_VIRT_CPUS */
+
+#define vcpumask_scnprintf(buf, len, src) \
+        __vcpumask_scnprintf((buf), (len), &(src), MAX_VIRT_CPUS)
+static inline int __vcpumask_scnprintf(char *buf, int len,
+                                       const vcpumask_t *srcp, int nbits)
+{
+    return bitmap_scnprintf(buf, len, srcp->bits, nbits);
+}
+
+#endif /* __XEN_VCPUMASK_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
