
[Xen-changelog] Begin updating to 2.6.13 base



# HG changeset patch
# User djm@xxxxxxxxxxxxxxx
# Node ID b7276814008c9c924fceecf6fd9f67ccddaadcb2
# Parent  44316ce8327754a7a70c80ffff551e7c4619e066
Begin updating to 2.6.13 base

diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/Makefile
--- a/xen/arch/ia64/Makefile    Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/Makefile    Wed Aug 31 20:32:27 2005
@@ -1,18 +1,21 @@
 include $(BASEDIR)/Rules.mk
 
-VPATH = linux linux-xen
+VPATH = linux linux-xen linux/lib
+#VPATH = linux-xen linux/lib
 
 # libs-y       += arch/ia64/lib/lib.a
 
 OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \
-       xenmisc.o pdb-stub.o acpi.o hypercall.o \
+       xenmisc.o acpi.o hypercall.o \
        machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \
        idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \
        xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \
-       extable.o linuxextable.o xenirq.o xentime.o \
+       extable.o linuxextable.o sort.o xenirq.o xentime.o \
        regionreg.o entry.o unaligned.o privop.o vcpu.o \
        irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \
        grant_table.o sn_console.o
+
+#OBJS += idiv64.o idiv32.o                     \
 
 # TMP holder to contain *.0 moved out of CONFIG_VTI
 OBJS += vmx_init.o
@@ -22,6 +25,13 @@
        vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \
        vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o
 endif
+
+# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+OBJS +=	bitop.o clear_page.o flush.o copy_page_mck.o			\
+       memset.o strlen.o memcpy_mck.o                                  \
+       __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                   \
+       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
+
 # perfmon.o
 # unwind.o needed for kernel unwinding (rare)
 
@@ -30,8 +40,8 @@
 # remove following line if not privifying in memory
 # OBJS += privify.o
 
-default: $(OBJS) head.o ia64lib.o xen.lds.s
-       $(LD) -r -o arch.o $(OBJS) ia64lib.o
+default: $(OBJS) head.o xen.lds.s
+       $(LD) -r -o arch.o $(OBJS)
        $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \
                -Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms
        $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET)
@@ -79,12 +89,29 @@
        $(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \
                -o xen.lds.s xen.lds.S
 
-ia64lib.o:
-       $(MAKE) -C linux/lib && cp linux/lib/ia64lib.o .
+# variants of divide/modulo
+# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib)
+__divdi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivdi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__moddi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umoddi3.o: idiv64.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+__divsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
+__udivsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
+__modsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
+__umodsi3.o: idiv32.S
+       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
+
 
 clean:
 	rm -f *.o *~ core  xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s
        rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h
-       rm -f lib/*.o
+       rm -f linux/lib/*.o
 
 .PHONY: default clean
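
For context: the eight __divsi3/__udivsi3/... __umoddi3 objects added above are the out-of-line
integer division helpers.  ia64 has no integer divide instruction and the hypervisor is not
linked against libgcc, so it builds its own copies from the shared Linux idiv32.S/idiv64.S
sources, selecting each variant purely with the -DUNSIGNED/-DMODULO defines in the rules above.
A minimal C illustration of why they are needed (not part of the patch; the function names are
invented for the example):

	/* Illustration only: 64-bit '/' and '%' in C may be lowered by gcc to
	 * calls to out-of-line helpers on ia64, which must then be linked in. */
	unsigned long long ns_to_ticks(unsigned long long ns, unsigned long long ns_per_tick)
	{
		return ns / ns_per_tick;	/* may become a call to __udivdi3 */
	}

	long long signed_remainder(long long a, long long b)
	{
		return a % b;			/* may become a call to __moddi3 */
	}
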
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/setup.c
--- a/xen/arch/ia64/linux-xen/setup.c   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/setup.c   Wed Aug 31 20:32:27 2005
@@ -4,10 +4,15 @@
  * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co
  *     David Mosberger-Tang <davidm@xxxxxxxxxx>
  *     Stephane Eranian <eranian@xxxxxxxxxx>
- * Copyright (C) 2000, Rohit Seth <rohit.seth@xxxxxxxxx>
+ * Copyright (C) 2000, 2004 Intel Corp
+ *     Rohit Seth <rohit.seth@xxxxxxxxx>
+ *     Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
+ *     Gordon Jin <gordon.jin@xxxxxxxxx>
  * Copyright (C) 1999 VA Linux Systems
  * Copyright (C) 1999 Walt Drummond <drummond@xxxxxxxxxxx>
  *
+ * 12/26/04 S.Siddha, G.Jin, R.Seth
+ *                     Add multi-threading and multi-core detection
  * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo().
  * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map
  * 03/31/00 R.Seth     cpu_initialized and current->processor fixes
@@ -15,6 +20,7 @@
  * 02/01/00 R.Seth     fixed get_cpuinfo for SMP
  * 01/07/99 S.Eranian  added the support for command line argument
  * 06/24/99 W.Drummond added boot_cpu_data.
+ * 05/28/05 Z. Menyhart        Dynamic stride size for "flush_icache_range()"
  */
 #include <linux/config.h>
 #include <linux/module.h>
@@ -35,6 +41,10 @@
 #include <linux/serial_core.h>
 #include <linux/efi.h>
 #include <linux/initrd.h>
+#ifndef XEN
+#include <linux/platform.h>
+#include <linux/pm.h>
+#endif
 
 #include <asm/ia32.h>
 #include <asm/machvec.h>
@@ -51,8 +61,10 @@
 #include <asm/smp.h>
 #include <asm/system.h>
 #include <asm/unistd.h>
+#ifdef XEN
 #include <asm/vmx.h>
 #include <asm/io.h>
+#endif
 
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
@@ -64,12 +76,16 @@
 #endif
 
 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
+#ifdef XEN
 DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr);
+#endif
 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
 DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8);
 unsigned long ia64_cycles_per_usec;
 struct ia64_boot_param *ia64_boot_param;
 struct screen_info screen_info;
+unsigned long vga_console_iobase;
+unsigned long vga_console_membase;
 
 unsigned long ia64_max_cacheline_size;
 unsigned long ia64_iobase;     /* virtual address for I/O accesses */
@@ -78,7 +94,12 @@
 EXPORT_SYMBOL(io_space);
 unsigned int num_io_spaces;
 
-unsigned char aux_device_present = 0xaa;        /* XXX remove this when legacy I/O is gone */
+/*
+ * "flush_icache_range()" needs to know what processor dependent stride size to use
+ * when it makes i-cache(s) coherent with d-caches.
+ */
+#define	I_CACHE_STRIDE_SHIFT	5	/* Safest way to go: 32 bytes by 32 bytes */
+unsigned long ia64_i_cache_stride_shift = ~0;
 
 /*
  * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
@@ -287,23 +308,25 @@
 static inline int __init
 early_console_setup (char *cmdline)
 {
+       int earlycons = 0;
+
 #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE
        {
                extern int sn_serial_console_early_setup(void);
                if (!sn_serial_console_early_setup())
-                       return 0;
+                       earlycons++;
        }
 #endif
 #ifdef CONFIG_EFI_PCDP
        if (!efi_setup_pcdp_console(cmdline))
-               return 0;
+               earlycons++;
 #endif
 #ifdef CONFIG_SERIAL_8250_CONSOLE
        if (!early_serial_console_init(cmdline))
-               return 0;
-#endif
-
-       return -1;
+               earlycons++;
+#endif
+
+       return (earlycons) ? 0 : -1;
 }
 
 static inline void
@@ -315,7 +338,34 @@
 #endif
 }
 
-void __init
+#ifdef CONFIG_SMP
+static void
+check_for_logical_procs (void)
+{
+       pal_logical_to_physical_t info;
+       s64 status;
+
+       status = ia64_pal_logical_to_phys(0, &info);
+       if (status == -1) {
+               printk(KERN_INFO "No logical to physical processor mapping "
+                      "available\n");
+               return;
+       }
+       if (status) {
+               printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n",
+                      status);
+               return;
+       }
+       /*
+        * Total number of siblings that BSP has.  Though not all of them 
+        * may have booted successfully. The correct number of siblings 
+        * booted is in info.overview_num_log.
+        */
+       smp_num_siblings = info.overview_tpc;
+       smp_num_cpucores = info.overview_cpp;
+}
+#endif
+
 #ifdef XEN
 early_setup_arch (char **cmdline_p)
 #else
@@ -398,6 +448,19 @@
 
 #ifdef CONFIG_SMP
        cpu_physical_id(0) = hard_smp_processor_id();
+
+       cpu_set(0, cpu_sibling_map[0]);
+       cpu_set(0, cpu_core_map[0]);
+
+       check_for_logical_procs();
+       if (smp_num_cpucores > 1)
+               printk(KERN_INFO
+                      "cpu package is Multi-Core capable: number of cores=%d\n",
+                      smp_num_cpucores);
+       if (smp_num_siblings > 1)
+               printk(KERN_INFO
+                      "cpu package is Multi-Threading capable: number of siblings=%d\n",
+                      smp_num_siblings);
 #endif
 
 #ifdef XEN
@@ -505,12 +568,23 @@
                   "cpu regs   : %u\n"
                   "cpu MHz    : %lu.%06lu\n"
                   "itc MHz    : %lu.%06lu\n"
-                  "BogoMIPS   : %lu.%02lu\n\n",
+                  "BogoMIPS   : %lu.%02lu\n",
                   cpunum, c->vendor, family, c->model, c->revision, c->archrev,
                   features, c->ppn, c->number,
                   c->proc_freq / 1000000, c->proc_freq % 1000000,
                   c->itc_freq / 1000000, c->itc_freq % 1000000,
                   lpj*HZ/500000, (lpj*HZ/5000) % 100);
+#ifdef CONFIG_SMP
+       seq_printf(m, "siblings   : %u\n", c->num_log);
+       if (c->threads_per_core > 1 || c->cores_per_socket > 1)
+               seq_printf(m,
+                          "physical id: %u\n"
+                          "core id    : %u\n"
+                          "thread id  : %u\n",
+                          c->socket_id, c->core_id, c->thread_id);
+#endif
+       seq_printf(m,"\n");
+
        return 0;
 }
 
@@ -581,6 +655,14 @@
        memcpy(c->vendor, cpuid.field.vendor, 16);
 #ifdef CONFIG_SMP
        c->cpu = smp_processor_id();
+
+       /* below default values will be overwritten  by identify_siblings() 
+        * for Multi-Threading/Multi-Core capable cpu's
+        */
+       c->threads_per_core = c->cores_per_socket = c->num_log = 1;
+       c->socket_id = -1;
+
+       identify_siblings(c);
 #endif
        c->ppn = cpuid.field.ppn;
        c->number = cpuid.field.number;
@@ -611,6 +693,12 @@
        /* start_kernel() requires this... */
 }
 
+/*
+ * Calculate the max. cache line size.
+ *
+ * In addition, the minimum of the i-cache stride sizes is calculated for
+ * "flush_icache_range()".
+ */
 static void
 get_max_cacheline_size (void)
 {
@@ -624,6 +712,8 @@
                 printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n",
                        __FUNCTION__, status);
                 max = SMP_CACHE_BYTES;
+               /* Safest setup for "flush_icache_range()" */
+               ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
                goto out;
         }
 
@@ -632,14 +722,31 @@
                                                    &cci);
                if (status != 0) {
                        printk(KERN_ERR
-                              "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n",
+                              "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
                               __FUNCTION__, l, status);
                        max = SMP_CACHE_BYTES;
+                       /* The safest setup for "flush_icache_range()" */
+                       cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+                       cci.pcci_unified = 1;
                }
                line_size = 1 << cci.pcci_line_size;
                if (line_size > max)
                        max = line_size;
-        }
+               if (!cci.pcci_unified) {
+                       status = ia64_pal_cache_config_info(l,
+                                                   /* cache_type (instruction)= */ 1,
+                                                   &cci);
+                       if (status != 0) {
+                               printk(KERN_ERR
+                               "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
+                                       __FUNCTION__, l, status);
+                               /* The safest setup for "flush_icache_range()" */
+                               cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+                       }
+               }
+               if (cci.pcci_stride < ia64_i_cache_stride_shift)
+                       ia64_i_cache_stride_shift = cci.pcci_stride;
+       }
   out:
        if (max > ia64_max_cacheline_size)
                ia64_max_cacheline_size = max;
@@ -700,7 +807,17 @@
        ia64_set_kr(IA64_KR_FPU_OWNER, 0);
 
        /*
-        * Initialize default control register to defer all speculative faults.  The
+        * Initialize the page-table base register to a global
+        * directory with all zeroes.  This ensure that we can handle
+        * TLB-misses to user address-space even before we created the
+        * first user address-space.  This may happen, e.g., due to
+        * aggressive use of lfetch.fault.
+        */
+       ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page)));
+
+       /*
+        * Initialize default control register to defer speculative faults except
+        * for those arising from TLB misses, which are not deferred.  The
         * kernel MUST NOT depend on a particular setting of these bits (in other words,
         * the kernel must have recovery code for all speculative accesses).  Turn on
         * dcr.lc as per recommendation by the architecture team.  Most IA-32 apps
@@ -762,6 +879,9 @@
        /* size of physical stacked register partition plus 8 bytes: */
        __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8;
        platform_cpu_init();
+#ifndef XEN
+       pm_idle = default_idle;
+#endif
 }
 
 void
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/extable.c
--- a/xen/arch/ia64/linux/extable.c     Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/extable.c     Wed Aug 31 20:32:27 2005
@@ -6,29 +6,29 @@
  */
 
 #include <linux/config.h>
+#include <linux/sort.h>
 
 #include <asm/uaccess.h>
 #include <asm/module.h>
 
-static inline int
-compare_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static int cmp_ex(const void *a, const void *b)
 {
+       const struct exception_table_entry *l = a, *r = b;
        u64 lip = (u64) &l->addr + l->addr;
        u64 rip = (u64) &r->addr + r->addr;
 
+       /* avoid overflow */
+       if (lip > rip)
+               return 1;
        if (lip < rip)
                return -1;
-       if (lip == rip)
-               return 0;
-       else
-               return 1;
+       return 0;
 }
 
-static inline void
-swap_entries (struct exception_table_entry *l, struct exception_table_entry *r)
+static void swap_ex(void *a, void *b, int size)
 {
+       struct exception_table_entry *l = a, *r = b, tmp;
        u64 delta = (u64) r - (u64) l;
-       struct exception_table_entry tmp;
 
        tmp = *l;
        l->addr = r->addr + delta;
@@ -38,23 +38,20 @@
 }
 
 /*
- * Sort the exception table.  It's usually already sorted, but there may be unordered
- * entries due to multiple text sections (such as the .init text section).  Note that the
- * exception-table-entries contain location-relative addresses, which requires a bit of
- * care during sorting to avoid overflows in the offset members (e.g., it would not be
- * safe to make a temporary copy of an exception-table entry on the stack, because the
- * stack may be more than 2GB away from the exception-table).
+ * Sort the exception table. It's usually already sorted, but there
+ * may be unordered entries due to multiple text sections (such as the
+ * .init text section). Note that the exception-table-entries contain
+ * location-relative addresses, which requires a bit of care during
+ * sorting to avoid overflows in the offset members (e.g., it would
+ * not be safe to make a temporary copy of an exception-table entry on
+ * the stack, because the stack may be more than 2GB away from the
+ * exception-table).
  */
-void
-sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish)
+void sort_extable (struct exception_table_entry *start,
+                  struct exception_table_entry *finish)
 {
-       struct exception_table_entry *p, *q;
-
-       /* insertion sort */
-       for (p = start + 1; p < finish; ++p)
-               /* start .. p-1 is sorted; push p down to it's proper place */
-               for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q)
-                       swap_entries(&q[0], &q[-1]);
+       sort(start, finish - start, sizeof(struct exception_table_entry),
+            cmp_ex, swap_ex);
 }
 
 const struct exception_table_entry *
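
Note on the "avoid overflow" comment in cmp_ex(): the entries hold location-relative
addresses, and two decoded addresses can legitimately be far more than 2GB apart, so a
subtraction-style comparator truncated to the int return value can report the wrong order.
A small standalone illustration of the hazard (not part of the patch):

	#include <stdint.h>

	/* Buggy: the 64-bit difference is truncated to int, so for addresses
	 * far apart the sign of the result is effectively meaningless. */
	static int cmp_bad(uint64_t lip, uint64_t rip)
	{
		return (int)(lip - rip);
	}

	/* Safe: mirrors cmp_ex() -- only ever returns -1, 0 or 1. */
	static int cmp_good(uint64_t lip, uint64_t rip)
	{
		if (lip > rip)
			return 1;
		if (lip < rip)
			return -1;
		return 0;
	}
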
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/ia64_ksyms.c
--- a/xen/arch/ia64/linux/ia64_ksyms.c  Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/ia64_ksyms.c  Wed Aug 31 20:32:27 2005
@@ -57,9 +57,6 @@
 EXPORT_SYMBOL(__strlen_user);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__strnlen_user);
-
-#include <asm/unistd.h>
-EXPORT_SYMBOL(__ia64_syscall);
 
 /* from arch/ia64/lib */
 extern void __divsi3(void);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/flush.S
--- a/xen/arch/ia64/linux/lib/flush.S   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/flush.S   Wed Aug 31 20:32:27 2005
@@ -1,39 +1,61 @@
 /*
  * Cache flushing routines.
  *
- * Copyright (C) 1999-2001 Hewlett-Packard Co
- * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
+ * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@xxxxxxxxxx>
+ *
+ * 05/28/05 Zoltan Menyhart    Dynamic stride size
  */
+
 #include <asm/asmmacro.h>
-#include <asm/page.h>
+
 
        /*
         * flush_icache_range(start,end)
-        *      Must flush range from start to end-1 but nothing else (need to
+        *
+        *      Make i-cache(s) coherent with d-caches.
+        *
+        *      Must deal with range from start to end-1 but nothing else (need to
         *      be careful not to touch addresses that may be unmapped).
+        *
+        *      Note: "in0" and "in1" are preserved for debugging purposes.
         */
 GLOBAL_ENTRY(flush_icache_range)
+
        .prologue
-       alloc r2=ar.pfs,2,0,0,0
-       sub r8=in1,in0,1
+       alloc   r2=ar.pfs,2,0,0,0
+       movl    r3=ia64_i_cache_stride_shift
+       mov     r21=1
        ;;
-       shr.u r8=r8,5                   // we flush 32 bytes per iteration
-       .save ar.lc, r3
-       mov r3=ar.lc                    // save ar.lc
+       ld8     r20=[r3]                // r20: stride shift
+       sub     r22=in1,r0,1            // last byte address
+       ;;
+       shr.u   r23=in0,r20             // start / (stride size)
+       shr.u   r22=r22,r20             // (last byte address) / (stride size)
+       shl     r21=r21,r20             // r21: stride size of the i-cache(s)
+       ;;
+       sub     r8=r22,r23              // number of strides - 1
+       shl     r24=r23,r20             // r24: addresses for "fc.i" =
+                                       //      "start" rounded down to stride 
boundary
+       .save   ar.lc,r3
+       mov     r3=ar.lc                // save ar.lc
        ;;
 
        .body
-
-       mov ar.lc=r8
+       mov     ar.lc=r8
        ;;
-.Loop: fc in0                          // issuable on M0 only
-       add in0=32,in0
+       /*
+        * 32 byte aligned loop, even number of (actually 2) bundles
+        */
+.Loop: fc.i    r24                     // issuable on M0 only
+       add     r24=r21,r24             // we flush "stride size" bytes per iteration
+       nop.i   0
        br.cloop.sptk.few .Loop
        ;;
        sync.i
        ;;
        srlz.i
        ;;
-       mov ar.lc=r3                    // restore ar.lc
+       mov     ar.lc=r3                // restore ar.lc
        br.ret.sptk.many rp
 END(flush_icache_range)
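
In C terms, the rewritten loop rounds the start address down to a stride boundary and issues
one fc.i per stride, with the stride taken from ia64_i_cache_stride_shift as set up in
setup.c above.  A rough sketch of the logic, for illustration only (not code from the patch):

	/* Sketch of the new flush_icache_range() loop; assumes end > start. */
	void flush_icache_range_sketch(unsigned long start, unsigned long end)
	{
		extern unsigned long ia64_i_cache_stride_shift;
		unsigned long shift  = ia64_i_cache_stride_shift;
		unsigned long stride = 1UL << shift;
		unsigned long addr   = (start >> shift) << shift;	/* round down */
		unsigned long n      = ((end - 1) >> shift) - (start >> shift) + 1;

		while (n--) {
			/* fc.i addr -- make the i-cache(s) coherent for this stride */
			addr += stride;
		}
		/* followed by sync.i ;; srlz.i to serialize the flushes */
	}
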
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy_mck.S
--- a/xen/arch/ia64/linux/lib/memcpy_mck.S      Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memcpy_mck.S      Wed Aug 31 20:32:27 2005
@@ -75,6 +75,7 @@
        mov     f6=f0
        br.cond.sptk .common_code
        ;;
+END(memcpy)
 GLOBAL_ENTRY(__copy_user)
        .prologue
 // check dest alignment
@@ -300,7 +301,7 @@
        add     src_pre_mem=0,src0      // prefetch src pointer
        add     dst_pre_mem=0,dst0      // prefetch dest pointer
        and     src0=-8,src0            // 1st src pointer
-(p7)   mov     ar.lc = r21
+(p7)   mov     ar.lc = cnt
 (p8)   mov     ar.lc = r0
        ;;
        TEXT_ALIGN(32)
@@ -524,7 +525,6 @@
 #undef B
 #undef C
 #undef D
-END(memcpy)
 
 /*
  * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memset.S
--- a/xen/arch/ia64/linux/lib/memset.S  Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/lib/memset.S  Wed Aug 31 20:32:27 2005
@@ -57,10 +57,10 @@
 { .mmi
        .prologue
        alloc   tmp = ar.pfs, 3, 0, 0, 0
-       .body
        lfetch.nt1 [dest]                       //
        .save   ar.lc, save_lc
        mov.i   save_lc = ar.lc
+       .body
 } { .mmi
        mov     ret0 = dest                     // return value
        cmp.ne  p_nz, p_zr = value, r0          // use stf.spill if value is zero
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/pcdp.h
--- a/xen/arch/ia64/linux/pcdp.h        Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/pcdp.h        Wed Aug 31 20:32:27 2005
@@ -2,7 +2,7 @@
  * Definitions for PCDP-defined console devices
  *
  * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf
- * v2.0:  http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf
+ * v2.0:  http://www.dig64.org/specifications/DIG64_PCDPv20.pdf
  *
  * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P.
  *     Khalid Aziz <khalid.aziz@xxxxxx>
@@ -52,11 +52,36 @@
        u32                             clock_rate;
        u8                              pci_prog_intfc;
        u8                              flags;
-};
+       u16                             conout_index;
+       u32                             reserved;
+} __attribute__((packed));
+
+#define PCDP_IF_PCI    1
+
+/* pcdp_if_pci.trans */
+#define PCDP_PCI_TRANS_IOPORT  0x02
+#define PCDP_PCI_TRANS_MMIO    0x01
+
+struct pcdp_if_pci {
+       u8                      interconnect;
+       u8                      reserved;
+       u16                     length;
+       u8                      segment;
+       u8                      bus;
+       u8                      dev;
+       u8                      fun;
+       u16                     dev_id;
+       u16                     vendor_id;
+       u32                     acpi_interrupt;
+       u64                     mmio_tra;
+       u64                     ioport_tra;
+       u8                      flags;
+       u8                      trans;
+} __attribute__((packed));
 
 struct pcdp_vga {
        u8                      count;          /* address space descriptors */
-};
+} __attribute__((packed));
 
 /* pcdp_device.flags */
 #define PCDP_PRIMARY_CONSOLE   1
@@ -66,7 +91,9 @@
        u8                      flags;
        u16                     length;
        u16                     efi_index;
-};
+       /* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */
+       /* next data is device specific type (currently only pcdp_vga) */
+} __attribute__((packed));
 
 struct pcdp {
        u8                      signature[4];
@@ -81,4 +108,4 @@
        u32                     num_uarts;
        struct pcdp_uart        uart[0];        /* actual size is num_uarts */
        /* remainder of table is pcdp_device structures */
-};
+} __attribute__((packed));
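
The structures in this header are overlaid directly on the firmware-provided PCDP table, so
they are now declared packed: without it the compiler would be free to insert padding and the
in-memory layout would no longer match the table.  A hypothetical walker, for illustration
only (it assumes the table header carries a total length, as ACPI-style tables do, and that
device entries are chained by their own length fields):

	static void walk_pcdp_devices(struct pcdp *pcdp, unsigned int table_len)
	{
		unsigned char *p   = (unsigned char *)&pcdp->uart[pcdp->num_uarts];
		unsigned char *end = (unsigned char *)pcdp + table_len;

		while (p + sizeof(struct pcdp_device) <= end) {
			struct pcdp_device *dev = (struct pcdp_device *)p;

			/* interface data (pcdp_if_pci) and device-specific data
			 * (e.g. pcdp_vga) follow the fixed header within dev->length */
			p += dev->length;
		}
	}
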
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/minstate.h
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/minstate.h        Wed Aug 31 20:32:27 2005
@@ -0,0 +1,254 @@
+#include <linux/config.h>
+
+#include <asm/cache.h>
+
+#include "entry.h"
+
+/*
+ * For ivt.s we want to access the stack virtually so we don't have to disable translation
+ * on interrupts.
+ *
+ *  On entry:
+ *     r1:     pointer to current task (ar.k6)
+ */
+#define MINSTATE_START_SAVE_MIN_VIRT                                           
                \
+(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, 
little-endian, loadrs=0 */     \
+       ;;                                                                      
                \
+(pUStk)        mov.m r24=ar.rnat;                                              
                        \
+(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base 
of RBS */               \
+(pKStk) mov r1=sp;                                     /* get sp  */           
                \
+       ;;                                                                      
                \
+(pUStk) lfetch.fault.excl.nt1 [r22];                                           
                \
+(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base 
of memory stack */      \
+(pUStk)        mov r23=ar.bspstore;                            /* save 
ar.bspstore */                  \
+       ;;                                                                      
                \
+(pUStk)        mov ar.bspstore=r22;                            /* switch to 
kernel RBS */              \
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, 
use sp (r12) */   \
+       ;;                                                                      
                \
+(pUStk)        mov r18=ar.bsp;                                                 
                        \
+(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, 
loadrs=0 */             \
+
+#define MINSTATE_END_SAVE_MIN_VIRT                                             
                \
+       bsw.1;                  /* switch back to bank 1 (must be last in insn 
group) */        \
+       ;;
+
+/*
+ * For mca_asm.S we want to access the stack physically since the state is saved before we
+ * go virtual and don't want to destroy the iip or ipsr.
+ */
+#define MINSTATE_START_SAVE_MIN_PHYS                                           
                \
+(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                         
                \
+(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                   
                \
+(pKStk) ld8 r3 = [r3];;                                                        
                        \
+(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                            
                \
+(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                          
                \
+(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, 
little-endian, loadrs=0 */     \
+(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of 
register backing store */    \
+       ;;                                                                      
                \
+(pUStk)        mov r24=ar.rnat;                                                
                        \
+(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base 
of memory stack */      \
+(pUStk)        mov r23=ar.bspstore;                            /* save 
ar.bspstore */                  \
+(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel 
virtual addr of RBS */        \
+       ;;                                                                      
                \
+(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp 
(r12) */           \
+(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS 
*/                      \
+       ;;                                                                      
                \
+(pUStk)        mov r18=ar.bsp;                                                 
                        \
+(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, 
loadrs=0 */             \
+
+#define MINSTATE_END_SAVE_MIN_PHYS                                             
                \
+       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */  
                \
+       ;;
+
+#ifdef MINSTATE_VIRT
+# define MINSTATE_GET_CURRENT(reg)     \
+               movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
+               ld8 reg=[reg]
+# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
+# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
+#endif
+
+#ifdef MINSTATE_PHYS
+# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
+# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
+# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
+#endif
+
+/*
+ * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
+ * the minimum state necessary that allows us to turn psr.ic back
+ * on.
+ *
+ * Assumed state upon entry:
+ *     psr.ic: off
+ *     r31:    contains saved predicates (pr)
+ *
+ * Upon exit, the state is as follows:
+ *     psr.ic: off
+ *      r2 = points to &pt_regs.r16
+ *      r8 = contents of ar.ccv
+ *      r9 = contents of ar.csd
+ *     r10 = contents of ar.ssd
+ *     r11 = FPSR_DEFAULT
+ *     r12 = kernel sp (kernel virtual address)
+ *     r13 = points to current task_struct (kernel virtual address)
+ *     p15 = TRUE if psr.i is set in cr.ipsr
+ *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
+ *             preserved
+ *
+ * Note that psr.ic is NOT turned on by this macro.  This is so that
+ * we can pass interruption state as arguments to a handler.
+ */
+#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                      
                \
+       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                       
                \
+       mov r27=ar.rsc;                 /* M */                                 
                \
+       mov r20=r1;                     /* A */                                 
                \
+       mov r25=ar.unat;                /* M */                                 
                \
+       mov r29=cr.ipsr;                /* M */                                 
                \
+       mov r26=ar.pfs;                 /* I */                                 
                \
+       mov r28=cr.iip;                 /* M */                                 
                \
+       mov r21=ar.fpsr;                /* M */                                 
                \
+       COVER;                          /* B;; (or nothing) */                  
                \
+       ;;                                                                      
                \
+       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                         
                \
+       ;;                                                                      
                \
+       ld1 r17=[r16];                          /* load 
current->thread.on_ustack flag */       \
+       st1 [r16]=r0;                           /* clear 
current->thread.on_ustack flag */      \
+       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                          
                \
+       /* switch from user to kernel RBS: */                                   
                \
+       ;;                                                                      
                \
+       invala;                         /* M */                                 
                \
+       SAVE_IFS;                                                               
                \
+       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode 
already? */            \
+       ;;                                                                      
                \
+       MINSTATE_START_SAVE_MIN                                                 
                \
+       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line 
size */           \
+       adds r16=PT(CR_IPSR),r1;                                                
                \
+       ;;                                                                      
                \
+       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                             
                \
+       st8 [r16]=r29;          /* save cr.ipsr */                              
                \
+       ;;                                                                      
                \
+       lfetch.fault.excl.nt1 [r17];                                            
                \
+       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                      
                \
+       mov r29=b0                                                              
                \
+       ;;                                                                      
                \
+       adds r16=PT(R8),r1;     /* initialize first base pointer */             
                \
+       adds r17=PT(R9),r1;     /* initialize second base pointer */            
                \
+(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */           
                        \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r8,16;                                        
                        \
+.mem.offset 8,0; st8.spill [r17]=r9,16;                                        
                        \
+        ;;                                                                     
                \
+.mem.offset 0,0; st8.spill [r16]=r10,24;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r11,24;                                       
                \
+        ;;                                                                     
                \
+       st8 [r16]=r28,16;       /* save cr.iip */                               
                \
+       st8 [r17]=r30,16;       /* save cr.ifs */                               
                \
+(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                  
                        \
+       mov r8=ar.ccv;                                                          
                \
+       mov r9=ar.csd;                                                          
                \
+       mov r10=ar.ssd;                                                         
                \
+       movl r11=FPSR_DEFAULT;   /* L-unit */                                   
                \
+       ;;                                                                      
                \
+       st8 [r16]=r25,16;       /* save ar.unat */                              
                \
+       st8 [r17]=r26,16;       /* save ar.pfs */                               
                \
+       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */    
                \
+       ;;                                                                      
                \
+       st8 [r16]=r27,16;       /* save ar.rsc */                               
                \
+(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                      
                        \
+(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */           
                        \
+       ;;                      /* avoid RAW on r16 & r17 */                    
                \
+(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                  
                        \
+       st8 [r17]=r31,16;       /* save predicates */                           
                \
+(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */       
                        \
+       ;;                                                                      
                \
+       st8 [r16]=r29,16;       /* save b0 */                                   
                \
+       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */            
                \
+       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */      
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */          
                \
+.mem.offset 8,0; st8.spill [r17]=r12,16;                                       
                \
+       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes 
of scratch) */  \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r13,16;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */              
                \
+       movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;                      
                \
+       ld8 r13=[r13];                  /* establish 'current' */               
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r15,16;                                       
                \
+.mem.offset 8,0; st8.spill [r17]=r14,16;                                       
                \
+       ;;                                                                      
                \
+.mem.offset 0,0; st8.spill [r16]=r2,16;                                        
                        \
+.mem.offset 8,0; st8.spill [r17]=r3,16;                                        
                        \
+       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                     
                \
+       ;;                                                                      
                \
+       EXTRA;                                                                  
                \
+       movl r1=__gp;           /* establish kernel global pointer */           
                \
+       ;;                                                                      
                \
+       MINSTATE_END_SAVE_MIN
+
+/*
+ * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
+ *
+ * Assumed state upon entry:
+ *     psr.ic: on
+ *     r2:     points to &pt_regs.r16
+ *     r3:     points to &pt_regs.r17
+ *     r8:     contents of ar.ccv
+ *     r9:     contents of ar.csd
+ *     r10:    contents of ar.ssd
+ *     r11:    FPSR_DEFAULT
+ *
+ * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
+ */
+#define SAVE_REST                              \
+.mem.offset 0,0; st8.spill [r2]=r16,16;                \
+.mem.offset 8,0; st8.spill [r3]=r17,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r18,16;                \
+.mem.offset 8,0; st8.spill [r3]=r19,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r20,16;                \
+.mem.offset 8,0; st8.spill [r3]=r21,16;                \
+       mov r18=b6;                             \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r22,16;                \
+.mem.offset 8,0; st8.spill [r3]=r23,16;                \
+       mov r19=b7;                             \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r24,16;                \
+.mem.offset 8,0; st8.spill [r3]=r25,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r26,16;                \
+.mem.offset 8,0; st8.spill [r3]=r27,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r28,16;                \
+.mem.offset 8,0; st8.spill [r3]=r29,16;                \
+       ;;                                      \
+.mem.offset 0,0; st8.spill [r2]=r30,16;                \
+.mem.offset 8,0; st8.spill [r3]=r31,32;                \
+       ;;                                      \
+       mov ar.fpsr=r11;        /* M-unit */    \
+       st8 [r2]=r8,8;          /* ar.ccv */    \
+       adds r24=PT(B6)-PT(F7),r3;              \
+       ;;                                      \
+       stf.spill [r2]=f6,32;                   \
+       stf.spill [r3]=f7,32;                   \
+       ;;                                      \
+       stf.spill [r2]=f8,32;                   \
+       stf.spill [r3]=f9,32;                   \
+       ;;                                      \
+       stf.spill [r2]=f10;                     \
+       stf.spill [r3]=f11;                     \
+       adds r25=PT(B7)-PT(F11),r3;             \
+       ;;                                      \
+       st8 [r24]=r18,16;       /* b6 */        \
+       st8 [r25]=r19,16;       /* b7 */        \
+       ;;                                      \
+       st8 [r24]=r9;           /* ar.csd */    \
+       st8 [r25]=r10;          /* ar.ssd */    \
+       ;;
+
+#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
+#define SAVE_MIN_WITH_COVER_R19	DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
+#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/sort.c
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux-xen/sort.c    Wed Aug 31 20:32:27 2005
@@ -0,0 +1,122 @@
+/*
+ * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ *
+ * Jan 23 2005  Matt Mackall <mpm@xxxxxxxxxxx>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#ifdef XEN
+#include <linux/types.h>
+#endif
+
+void u32_swap(void *a, void *b, int size)
+{
+       u32 t = *(u32 *)a;
+       *(u32 *)a = *(u32 *)b;
+       *(u32 *)b = t;
+}
+
+void generic_swap(void *a, void *b, int size)
+{
+       char t;
+
+       do {
+               t = *(char *)a;
+               *(char *)a++ = *(char *)b;
+               *(char *)b++ = t;
+       } while (--size > 0);
+}
+
+/*
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp: pointer to comparison function
+ * @swap: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+void sort(void *base, size_t num, size_t size,
+         int (*cmp)(const void *, const void *),
+         void (*swap)(void *, void *, int size))
+{
+       /* pre-scale counters for performance */
+       int i = (num/2) * size, n = num * size, c, r;
+
+       if (!swap)
+               swap = (size == 4 ? u32_swap : generic_swap);
+
+       /* heapify */
+       for ( ; i >= 0; i -= size) {
+               for (r = i; r * 2 < n; r  = c) {
+                       c = r * 2;
+                       if (c < n - size && cmp(base + c, base + c + size) < 0)
+                               c += size;
+                       if (cmp(base + r, base + c) >= 0)
+                               break;
+                       swap(base + r, base + c, size);
+               }
+       }
+
+       /* sort */
+       for (i = n - size; i >= 0; i -= size) {
+               swap(base, base + i, size);
+               for (r = 0; r * 2 < i; r = c) {
+                       c = r * 2;
+                       if (c < i - size && cmp(base + c, base + c + size) < 0)
+                               c += size;
+                       if (cmp(base + r, base + c) >= 0)
+                               break;
+                       swap(base + r, base + c, size);
+               }
+       }
+}
+
+EXPORT_SYMBOL(sort);
+
+#if 0
+/* a simple boot-time regression test */
+
+int cmpint(const void *a, const void *b)
+{
+       return *(int *)a - *(int *)b;
+}
+
+static int sort_test(void)
+{
+       int *a, i, r = 1;
+
+       a = kmalloc(1000 * sizeof(int), GFP_KERNEL);
+       BUG_ON(!a);
+
+       printk("testing sort()\n");
+
+       for (i = 0; i < 1000; i++) {
+               r = (r * 725861) % 6599;
+               a[i] = r;
+       }
+
+       sort(a, 1000, sizeof(int), cmpint, NULL);
+
+       for (i = 0; i < 999; i++)
+               if (a[i] > a[i+1]) {
+                       printk("sort() failed!\n");
+                       break;
+               }
+
+       kfree(a);
+
+       return 0;
+}
+
+module_init(sort_test);
+#endif
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/README.origin
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/arch/ia64/linux/README.origin Wed Aug 31 20:32:27 2005
@@ -0,0 +1,20 @@
+Source files in this directory are identical copies of linux-2.6.13 files:
+
+cmdline.c              -> linux/lib/cmdline.c
+efi_stub.S             -> linux/arch/ia64/efi_stub.S
+extable.c              -> linux/arch/ia64/mm/extable.c
+hpsim.S                        -> linux/arch/ia64/hp/sim/hpsim.S
+ia64_ksyms.c           -> linux/arch/ia64/kernel/ia64_ksyms.c
+linuxextable.c         -> linux/kernel/extable.c
+machvec.c              -> linux/arch/ia64/kernel/machvec.c
+patch.c                        -> linux/arch/ia64/kernel/patch.c
+pcdp.h                 -> drivers/firmware/pcdp.h
+lib/bitop.c            -> linux/arch/ia64/lib/bitop.c
+lib/clear_page.S       -> linux/arch/ia64/lib/clear_page.S
+lib/copy_page_mck.S    -> linux/arch/ia64/lib/copy_page_mck.S
+lib/flush.S            -> linux/arch/ia64/lib/flush.S
+lib/idiv32.S           -> linux/arch/ia64/lib/idiv32.S
+lib/idiv64.S           -> linux/arch/ia64/lib/idiv64.S
+lib/memcpy_mck.S       -> linux/arch/ia64/lib/memcpy_mck.S
+lib/memset.S           -> linux/arch/ia64/lib/memset.S
+lib/strlen.S           -> linux/arch/ia64/lib/strlen.S
diff -r 44316ce83277 -r b7276814008c xen/include/asm-ia64/linux/sort.h
--- /dev/null   Tue Aug 30 23:51:51 2005
+++ b/xen/include/asm-ia64/linux/sort.h Wed Aug 31 20:32:27 2005
@@ -0,0 +1,10 @@
+#ifndef _LINUX_SORT_H
+#define _LINUX_SORT_H
+
+#include <linux/types.h>
+
+void sort(void *base, size_t num, size_t size,
+         int (*cmp)(const void *, const void *),
+         void (*swap)(void *, void *, int));
+
+#endif
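
For reference, a minimal example of calling the sort() declared here with a caller-supplied
comparator; the struct and values are invented for illustration (the in-tree sort_test() in
sort.c exercises it the same way with plain ints):

	#include <linux/sort.h>

	struct range {
		unsigned long start, len;
	};

	static int cmp_range(const void *a, const void *b)
	{
		const struct range *l = a, *r = b;

		if (l->start < r->start)
			return -1;
		return l->start > r->start ? 1 : 0;
	}

	/* Sort 'n' ranges in place; passing NULL for swap lets sort() pick a
	 * default (byte-wise swap, or the u32 swap when size == 4). */
	static void sort_ranges(struct range *ranges, size_t n)
	{
		sort(ranges, n, sizeof(*ranges), cmp_range, NULL);
	}
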
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/lib/Makefile
--- a/xen/arch/ia64/lib/Makefile        Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                  \
-       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o                   \
-       bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
-       clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o   \
-       flush.o ip_fast_csum.o do_csum.o copy_user.o                    \
-       memset.o strlen.o memcpy.o 
-
-default: $(OBJS)
-       $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
-       rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/Makefile
--- a/xen/arch/ia64/linux/lib/Makefile  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-#
-# Makefile for ia64-specific library routines..
-#
-
-include $(BASEDIR)/Rules.mk
-
-OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o                  \
-       __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o                   \
-       bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \
-       clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o   \
-       flush.o ip_fast_csum.o do_csum.o copy_user.o                    \
-       memset.o strlen.o memcpy.o 
-
-default: $(OBJS)
-       $(LD) -r -o ia64lib.o $(OBJS)
-
-AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__
-
-__divdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivdi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__moddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umoddi3.o: idiv64.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-__divsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $<
-
-__udivsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $<
-
-__modsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $<
-
-__umodsi3.o: idiv32.S
-       $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $<
-
-clean:
-       rm -f *.o *~
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/carta_random.S
--- a/xen/arch/ia64/linux/lib/carta_random.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,54 +0,0 @@
-/*
- * Fast, simple, yet decent quality random number generator based on
- * a paper by David G. Carta ("Two Fast Implementations of the
- * `Minimal Standard' Random Number Generator," Communications of the
- * ACM, January, 1990).
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-#define a      r2
-#define m      r3
-#define lo     r8
-#define hi     r9
-#define t0     r16
-#define t1     r17
-#define        seed    r32
-
-GLOBAL_ENTRY(carta_random32)
-       movl    a = (16807 << 16) | 16807
-       ;;
-       pmpyshr2.u t0 = a, seed, 0
-       pmpyshr2.u t1 = a, seed, 16
-       ;;
-       unpack2.l t0 = t1, t0
-       dep     m = -1, r0, 0, 31
-       ;;
-       zxt4    lo = t0
-       shr.u   hi = t0, 32
-       ;;
-       dep     t0 = 0, hi, 15, 49      // t0 = (hi & 0x7fff)
-       ;;
-       shl     t0 = t0, 16             // t0 = (hi & 0x7fff) << 16
-       shr     t1 = hi, 15             // t1 = (hi >> 15)
-       ;;
-       add     lo = lo, t0
-       ;;
-       cmp.gtu p6, p0 = lo, m
-       ;;
-(p6)   and     lo = lo, m
-       ;;
-(p6)   add     lo = 1, lo
-       ;;
-       add     lo = lo, t1
-       ;;
-       cmp.gtu p6, p0 = lo, m
-       ;;
-(p6)   and     lo = lo, m
-       ;;
-(p6)   add     lo = 1, lo
-       br.ret.sptk.many rp
-END(carta_random32)
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/checksum.c
--- a/xen/arch/ia64/linux/lib/checksum.c        Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,102 +0,0 @@
-/*
- * Network checksum routines
- *
- * Copyright (C) 1999, 2003 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code coming from arch/alpha/lib/checksum.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed..
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/byteorder.h>
-
-static inline unsigned short
-from64to16 (unsigned long x)
-{
-       /* add up 32-bit words for 33 bits */
-       x = (x & 0xffffffff) + (x >> 32);
-       /* add up 16-bit and 17-bit words for 17+c bits */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up 16-bit and 2-bit for 16+c bit */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up carry.. */
-       x = (x & 0xffff) + (x >> 16);
-       return x;
-}
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented.
- */
-unsigned short int
-csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len,
-                  unsigned short proto, unsigned int sum)
-{
-       return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) +
-                          ((unsigned long) proto << 8));
-}
-
-EXPORT_SYMBOL(csum_tcpudp_magic);
-
-unsigned int
-csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len,
-                   unsigned short proto, unsigned int sum)
-{
-       unsigned long result;
-
-       result = (saddr + daddr + sum +
-                 ((unsigned long) ntohs(len) << 16) +
-                 ((unsigned long) proto << 8));
-
-       /* Fold down to 32-bits so we don't lose in the typedef-less network stack.  */
-       /* 64 to 33 */
-       result = (result & 0xffffffff) + (result >> 32);
-       /* 33 to 32 */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-extern unsigned long do_csum (const unsigned char *, long);
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 32-bit boundary
- */
-unsigned int
-csum_partial (const unsigned char * buff, int len, unsigned int sum)
-{
-       unsigned long result = do_csum(buff, len);
-
-       /* add in old sum, and carry.. */
-       result += sum;
-       /* 32+c bits -> 32 bits */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-EXPORT_SYMBOL(csum_partial);
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-unsigned short
-ip_compute_csum (unsigned char * buff, int len)
-{
-       return ~do_csum(buff,len);
-}
-
-EXPORT_SYMBOL(ip_compute_csum);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/clear_user.S
--- a/xen/arch/ia64/linux/lib/clear_user.S      Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,209 +0,0 @@
-/*
- * This routine clears to zero a linear memory buffer in user space.
- *
- * Inputs:
- *     in0:    address of buffer
- *     in1:    length of buffer in bytes
- * Outputs:
- *     r8:     number of bytes that didn't get cleared due to a fault
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-//
-// arguments
-//
-#define buf            r32
-#define len            r33
-
-//
-// local registers
-//
-#define cnt            r16
-#define buf2           r17
-#define saved_lc       r18
-#define saved_pfs      r19
-#define tmp            r20
-#define len2           r21
-#define len3           r22
-
-//
-// Theory of operations:
-//     - we check whether or not the buffer is small, i.e., less than 17
-//       in which case we do the byte by byte loop.
-//
-//     - Otherwise we go progressively from 1 byte store to 8byte store in
-//       the head part, the body is a 16byte store loop and we finish we the
-//       tail for the last 15 bytes.
-//       The good point about this breakdown is that the long buffer handling
-//       contains only 2 branches.
-//
-//     The reason for not using shifting & masking for both the head and the
-//     tail is to stay semantically correct. This routine is not supposed
-//     to write bytes outside of the buffer. While most of the time this would
-//     be ok, we can't tolerate a mistake. A classical example is the case
-//     of multithreaded code were to the extra bytes touched is actually owned
-//     by another thread which runs concurrently to ours. Another, less likely,
-//     example is with device drivers where reading an I/O mapped location may
-//     have side effects (same thing for writing).
-//
-
-GLOBAL_ENTRY(__do_clear_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc   saved_pfs=ar.pfs,2,0,0,0
-       cmp.eq p6,p0=r0,len             // check for zero length
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc              // preserve ar.lc (slow)
-       .body
-       ;;                              // avoid WAW on CFM
-       adds tmp=-1,len                 // br.ctop is repeat/until
-       mov ret0=len                    // return value is length at this point
-(p6)   br.ret.spnt.many rp
-       ;;
-       cmp.lt p6,p0=16,len             // if len > 16 then long memset
-       mov ar.lc=tmp                   // initialize lc for small count
-(p6)   br.cond.dptk .long_do_clear
-       ;;                              // WAR on ar.lc
-       //
-       // worst case 16 iterations, avg 8 iterations
-       //
-       // We could have played with the predicates to use the extra
-       // M slot for 2 stores/iteration but the cost of initializing
-       // the various counters compared to how long the loop is supposed
-       // to last on average does not make this solution viable.
-       //
-1:
-       EX( .Lexit1, st1 [buf]=r0,1 )
-       adds len=-1,len                 // countdown length using len
-       br.cloop.dptk 1b
-       ;;                              // avoid RAW on ar.lc
-       //
-       // .Lexit4: comes from byte by byte loop
-       //          len contains bytes left
-.Lexit1:
-       mov ret0=len                    // faster than using ar.lc
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp             // end of short clear_user
-
-
-       //
-       // At this point we know we have more than 16 bytes to copy
-       // so we focus on alignment (no branches required)
-       //
-       // The use of len/len2 for countdown of the number of bytes left
-       // instead of ret0 is due to the fact that the exception code
-       // changes the values of r8.
-       //
-.long_do_clear:
-       tbit.nz p6,p0=buf,0             // odd alignment (for long_do_clear)
-       ;;
-       EX( .Lexit3, (p6) st1 [buf]=r0,1 )      // 1-byte aligned
-(p6)   adds len=-1,len;;               // sync because buf is modified
-       tbit.nz p6,p0=buf,1
-       ;;
-       EX( .Lexit3, (p6) st2 [buf]=r0,2 )      // 2-byte aligned
-(p6)   adds len=-2,len;;
-       tbit.nz p6,p0=buf,2
-       ;;
-       EX( .Lexit3, (p6) st4 [buf]=r0,4 )      // 4-byte aligned
-(p6)   adds len=-4,len;;
-       tbit.nz p6,p0=buf,3
-       ;;
-       EX( .Lexit3, (p6) st8 [buf]=r0,8 )      // 8-byte aligned
-(p6)   adds len=-8,len;;
-       shr.u cnt=len,4         // number of 128-bit (2x64bit) words
-       ;;
-       cmp.eq p6,p0=r0,cnt
-       adds tmp=-1,cnt
-(p6)   br.cond.dpnt .dotail            // we have less than 16 bytes left
-       ;;
-       adds buf2=8,buf                 // setup second base pointer
-       mov ar.lc=tmp
-       ;;
-
-       //
-       // 16bytes/iteration core loop
-       //
-       // The second store can never generate a fault because
-       // we come into the loop only when we are 16-byte aligned.
-       // This means that if we cross a page then it will always be
-       // in the first store and never in the second.
-       //
-       //
-       //        We need to keep track of the remaining length. A possible (optimistic)
-       //        way would be to use ar.lc and derive how many bytes were left by
-       // doing : left= 16*ar.lc + 16.  this would avoid the addition at
-       // every iteration.
-       // However we need to keep the synchronization point. A template
-       // M;;MB does not exist and thus we can keep the addition at no
-       // extra cycle cost (use a nop slot anyway). It also simplifies the
-       // (unlikely)  error recovery code
-       //
-
-2:     EX(.Lexit3, st8 [buf]=r0,16 )
-       ;;                              // needed to get len correct when error
-       st8 [buf2]=r0,16
-       adds len=-16,len
-       br.cloop.dptk 2b
-       ;;
-       mov ar.lc=saved_lc
-       //
-       // tail correction based on len only
-       //
-       // We alternate the use of len3,len2 to allow parallelism and correct
-       // error handling. We also reuse p6/p7 to return correct value.
-       // The addition of len2/len3 does not cost anything more compared to
-       // the regular memset as we had empty slots.
-       //
-.dotail:
-       mov len2=len                    // for parallelization of error handling
-       mov len3=len
-       tbit.nz p6,p0=len,3
-       ;;
-       EX( .Lexit2, (p6) st8 [buf]=r0,8 )      // at least 8 bytes
-(p6)   adds len3=-8,len2
-       tbit.nz p7,p6=len,2
-       ;;
-       EX( .Lexit2, (p7) st4 [buf]=r0,4 )      // at least 4 bytes
-(p7)   adds len2=-4,len3
-       tbit.nz p6,p7=len,1
-       ;;
-       EX( .Lexit2, (p6) st2 [buf]=r0,2 )      // at least 2 bytes
-(p6)   adds len3=-2,len2
-       tbit.nz p7,p6=len,0
-       ;;
-       EX( .Lexit2, (p7) st1 [buf]=r0 )        // only 1 byte left
-       mov ret0=r0                             // success
-       br.ret.sptk.many rp                     // end of most likely path
-
-       //
-       // Outlined error handling code
-       //
-
-       //
-       // .Lexit3: comes from core loop, need restore pr/lc
-       //          len contains bytes left
-       //
-       //
-       // .Lexit2:
-       //      if p6 -> coming from st8 or st2 : len2 contains what's left
-       //      if p7 -> coming from st4 or st1 : len3 contains what's left
-       // We must restore lc/pr even though might not have been used.
-.Lexit2:
-       .pred.rel "mutex", p6, p7
-(p6)   mov len=len2
-(p7)   mov len=len3
-       ;;
-       //
-       // .Lexit4: comes from head, need not restore pr/lc
-       //          len contains bytes left
-       //
-.Lexit3:
-       mov ret0=len
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp
-END(__do_clear_user)
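
The head/body/tail strategy described in the theory-of-operations comment above corresponds, very roughly, to the following C sketch; it is illustrative only (no fault handling, made-up helper name) and not part of the changeset.

/* Simplified C model of __do_clear_user's long path: align the pointer with
 * 1/2/4/8-byte stores, clear 16 bytes per iteration, then handle the tail
 * without ever storing past the end of the buffer.
 */
static void clear_buffer(unsigned char *buf, unsigned long len)
{
	/* head: progressively larger stores until buf is 16-byte aligned */
	if (((unsigned long)buf & 1) && len >= 1) { *buf = 0; buf += 1; len -= 1; }
	if (((unsigned long)buf & 2) && len >= 2) { *(unsigned short *)buf = 0; buf += 2; len -= 2; }
	if (((unsigned long)buf & 4) && len >= 4) { *(unsigned int *)buf = 0;   buf += 4; len -= 4; }
	if (((unsigned long)buf & 8) && len >= 8) { *(unsigned long *)buf = 0;  buf += 8; len -= 8; }

	/* body: two 8-byte stores per iteration, 16 bytes at a time */
	while (len >= 16) {
		((unsigned long *)buf)[0] = 0;
		((unsigned long *)buf)[1] = 0;
		buf += 16;
		len -= 16;
	}

	/* tail: at most 15 bytes left */
	if (len & 8) { *(unsigned long *)buf = 0;  buf += 8; }
	if (len & 4) { *(unsigned int *)buf = 0;   buf += 4; }
	if (len & 2) { *(unsigned short *)buf = 0; buf += 2; }
	if (len & 1) { *buf = 0; }
}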
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_page.S
--- a/xen/arch/ia64/linux/lib/copy_page.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,98 +0,0 @@
-/*
- *
- * Optimized version of the standard copy_page() function
- *
- * Inputs:
- *     in0:    address of target page
- *     in1:    address of source page
- * Output:
- *     no return value
- *
- * Copyright (C) 1999, 2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *     David Mosberger <davidm@xxxxxxxxxx>
- *
- * 4/06/01 davidm      Tuned to make it perform well both for cached and uncached copies.
- */
-#include <asm/asmmacro.h>
-#include <asm/page.h>
-
-#define PIPE_DEPTH     3
-#define EPI            p[PIPE_DEPTH-1]
-
-#define lcount         r16
-#define saved_pr       r17
-#define saved_lc       r18
-#define saved_pfs      r19
-#define src1           r20
-#define src2           r21
-#define tgt1           r22
-#define tgt2           r23
-#define srcf           r24
-#define tgtf           r25
-#define tgt_last       r26
-
-#define Nrot           ((8*PIPE_DEPTH+7)&~7)
-
-GLOBAL_ENTRY(copy_page)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
-
-       .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
-             t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
-       .rotp p[PIPE_DEPTH]
-
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc
-       mov ar.ec=PIPE_DEPTH
-
-       mov lcount=PAGE_SIZE/64-1
-       .save pr, saved_pr
-       mov saved_pr=pr
-       mov pr.rot=1<<16
-
-       .body
-
-       mov src1=in1
-       adds src2=8,in1
-       mov tgt_last = PAGE_SIZE
-       ;;
-       adds tgt2=8,in0
-       add srcf=512,in1
-       mov ar.lc=lcount
-       mov tgt1=in0
-       add tgtf=512,in0
-       add tgt_last = tgt_last, in0
-       ;;
-1:
-(p[0]) ld8 t1[0]=[src1],16
-(EPI)  st8 [tgt1]=t1[PIPE_DEPTH-1],16
-(p[0]) ld8 t2[0]=[src2],16
-(EPI)  st8 [tgt2]=t2[PIPE_DEPTH-1],16
-       cmp.ltu p6,p0 = tgtf, tgt_last
-       ;;
-(p[0]) ld8 t3[0]=[src1],16
-(EPI)  st8 [tgt1]=t3[PIPE_DEPTH-1],16
-(p[0]) ld8 t4[0]=[src2],16
-(EPI)  st8 [tgt2]=t4[PIPE_DEPTH-1],16
-       ;;
-(p[0]) ld8 t5[0]=[src1],16
-(EPI)  st8 [tgt1]=t5[PIPE_DEPTH-1],16
-(p[0]) ld8 t6[0]=[src2],16
-(EPI)  st8 [tgt2]=t6[PIPE_DEPTH-1],16
-       ;;
-(p[0]) ld8 t7[0]=[src1],16
-(EPI)  st8 [tgt1]=t7[PIPE_DEPTH-1],16
-(p[0]) ld8 t8[0]=[src2],16
-(EPI)  st8 [tgt2]=t8[PIPE_DEPTH-1],16
-
-(p6)   lfetch [srcf], 64
-(p6)   lfetch [tgtf], 64
-       br.ctop.sptk.few 1b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000      // restore predicates
-       mov ar.pfs=saved_pfs
-       mov ar.lc=saved_lc
-       br.ret.sptk.many rp
-END(copy_page)
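
Stripped of the rotating-register software pipelining and the lfetch hints, copy_page() above moves a page 64 bytes per iteration through two alternating pointers; a rough C model follows (illustrative only, not part of the changeset; the page size is passed in as a parameter here, whereas the assembly takes it from asm/page.h).

/* Rough C model of the copy_page() loop above: 8 x 8-byte moves (64 bytes)
 * per iteration.
 */
static void copy_page_c(void *to, const void *from, unsigned long page_size)
{
	unsigned long *dst = to;
	const unsigned long *src = from;
	unsigned long n;

	for (n = 0; n < page_size / 64; n++) {
		dst[0] = src[0]; dst[1] = src[1];
		dst[2] = src[2]; dst[3] = src[3];
		dst[4] = src[4]; dst[5] = src[5];
		dst[6] = src[6]; dst[7] = src[7];
		dst += 8; src += 8;
	}
}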
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_user.S
--- a/xen/arch/ia64/linux/lib/copy_user.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,610 +0,0 @@
-/*
- *
- * Optimized version of the copy_user() routine.
- * It is used to copy data across the kernel/user boundary.
- *
- * The source and destination are always on opposite side of
- * the boundary. When reading from user space we must catch
- * faults on loads. When writing to user space we must catch
- * errors on stores. Note that because of the nature of the copy
- * we don't need to worry about overlapping regions.
- *
- *
- * Inputs:
- *     in0     address of source buffer
- *     in1     address of destination buffer
- *     in2     number of bytes to copy
- *
- * Outputs:
- *     ret0    0 in case of success. The number of bytes NOT copied in
- *             case of error.
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Fixme:
- *     - handle the case where we have more than 16 bytes and the alignments
- *       are different.
- *     - more benchmarking
- *     - fix extraneous stop bit introduced by the EX() macro.
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Tuneable parameters
-//
-#define COPY_BREAK     16      // we do byte copy below (must be >=16)
-#define PIPE_DEPTH     21      // pipe depth
-
-#define EPI            p[PIPE_DEPTH-1]
-
-//
-// arguments
-//
-#define dst            in0
-#define src            in1
-#define len            in2
-
-//
-// local registers
-//
-#define t1             r2      // rshift in bytes
-#define t2             r3      // lshift in bytes
-#define rshift         r14     // right shift in bits
-#define lshift         r15     // left shift in bits
-#define word1          r16
-#define word2          r17
-#define cnt            r18
-#define len2           r19
-#define saved_lc       r20
-#define saved_pr       r21
-#define tmp            r22
-#define val            r23
-#define src1           r24
-#define dst1           r25
-#define src2           r26
-#define dst2           r27
-#define len1           r28
-#define enddst         r29
-#define endsrc         r30
-#define saved_pfs      r31
-
-GLOBAL_ENTRY(__copy_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
-
-       .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
-       .rotp p[PIPE_DEPTH]
-
-       adds len2=-1,len        // br.ctop is repeat/until
-       mov ret0=r0
-
-       ;;                      // RAW of cfm when len=0
-       cmp.eq p8,p0=r0,len     // check for zero length
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc      // preserve ar.lc (slow)
-(p8)   br.ret.spnt.many rp     // empty memcpy()
-       ;;
-       add enddst=dst,len      // first byte after end of destination
-       add endsrc=src,len      // first byte after end of source
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates
-
-       .body
-
-       mov dst1=dst            // copy because of rotation
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-
-       mov src1=src            // copy because of rotation
-       mov ar.lc=len2          // initialize lc for small count
-       cmp.lt p10,p7=COPY_BREAK,len    // if len > COPY_BREAK then long copy
-
-       xor tmp=src,dst         // same alignment test prepare
-(p10)  br.cond.dptk .long_copy_user
-       ;;                      // RAW pr.rot/p16 ?
-       //
-       // Now we do the byte by byte loop with software pipeline
-       //
-       // p7 is necessarily false by now
-1:
-       EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 1b
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs            // restore ar.ec
-       br.ret.sptk.many rp             // end of short memcpy
-
-       //
-       // Not 8-byte aligned
-       //
-.diff_align_copy_user:
-       // At this point we know we have more than 16 bytes to copy
-       // and also that src and dest do _not_ have the same alignment.
-       and src2=0x7,src1                               // src offset
-       and dst2=0x7,dst1                               // dst offset
-       ;;
-       // The basic idea is that we copy byte-by-byte at the head so
-       // that we can reach 8-byte alignment for both src1 and dst1.
-       // Then copy the body using software pipelined 8-byte copy,
-       // shifting the two back-to-back words right and left, then copy
-       // the tail by copying byte-by-byte.
-       //
-       // Fault handling. If the byte-by-byte at the head fails on the
-       // load, then restart and finish the pipeline by copying zeros
-       // to the dst1. Then copy zeros for the rest of dst1.
-       // If 8-byte software pipeline fails on the load, do the same as
-       // failure_in3 does. If the byte-by-byte at the tail fails, it is
-       // handled simply by failure_in_pipe1.
-       //
-       // The case p14 means the source has more bytes in the
-       // first word (by the shifted part), whereas p15 needs to
-       // copy some bytes from the 2nd word of the source that has the
-       // tail of the 1st of the destination.
-       //
-
-       //
-       // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
-       // to copy the head to dst1, to start 8-byte copy software pipeline.
-       // We know src1 is not 8-byte aligned in this case.
-       //
-       cmp.eq p14,p15=r0,dst2
-(p15)  br.cond.spnt 1f
-       ;;
-       sub t1=8,src2
-       mov t2=src2
-       ;;
-       shl rshift=t2,3
-       sub len1=len,t1                                 // set len1
-       ;;
-       sub lshift=64,rshift
-       ;;
-       br.cond.spnt .word_copy_user
-       ;;
-1:
-       cmp.leu p14,p15=src2,dst2
-       sub t1=dst2,src2
-       ;;
-       .pred.rel "mutex", p14, p15
-(p14)  sub word1=8,src2                                // (8 - src offset)
-(p15)  sub t1=r0,t1                                    // absolute value
-(p15)  sub word1=8,dst2                                // (8 - dst offset)
-       ;;
-       // For the case p14, we don't need to copy the shifted part to
-       // the 1st word of destination.
-       sub t2=8,t1
-(p14)  sub word1=word1,t1
-       ;;
-       sub len1=len,word1                              // resulting len
-(p15)  shl rshift=t1,3                                 // in bits
-(p14)  shl rshift=t2,3
-       ;;
-(p14)  sub len1=len1,t1
-       adds cnt=-1,word1
-       ;;
-       sub lshift=64,rshift
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       mov ar.lc=cnt
-       ;;
-2:
-       EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 2b
-       ;;
-       clrrrb
-       ;;
-.word_copy_user:
-       cmp.gtu p9,p0=16,len1
-(p9)   br.cond.spnt 4f                 // if (16 > len1) skip 8-byte copy
-       ;;
-       shr.u cnt=len1,3                // number of 64-bit words
-       ;;
-       adds cnt=-1,cnt
-       ;;
-       .pred.rel "mutex", p14, p15
-(p14)  sub src1=src1,t2
-(p15)  sub src1=src1,t1
-       //
-       // Now both src1 and dst1 point to an 8-byte aligned address. And
-       // we have more than 8 bytes to copy.
-       //
-       mov ar.lc=cnt
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       ;;
-3:
-       //
-       // The pipeline consists of 3 stages:
-       // 1 (p16):     Load a word from src1
-       // 2 (EPI_1):   Shift right pair, saving to tmp
-       // 3 (EPI):     Store tmp to dst1
-       //
-       // To make it simple, use at least 2 (p16) loops to set up val1[n]
-       // because we need 2 back-to-back val1[] to get tmp.
-       // Note that this implies EPI_2 must be p18 or greater.
-       //
-
-#define EPI_1          p[PIPE_DEPTH-2]
-#define SWITCH(pred, shift)    cmp.eq pred,p0=shift,rshift
-#define CASE(pred, shift)      \
-       (pred)  br.cond.spnt .copy_user_bit##shift
-#define BODY(rshift)                                           \
-.copy_user_bit##rshift:                                                \
-1:                                                             \
-       EX(.failure_out,(EPI) st8 [dst1]=tmp,8);                \
-(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
-       EX(3f,(p16) ld8 val1[1]=[src1],8);                      \
-(p16)  mov val1[0]=r0;                                         \
-       br.ctop.dptk 1b;                                        \
-       ;;                                                      \
-       br.cond.sptk.many .diff_align_do_tail;                  \
-2:                                                             \
-(EPI)  st8 [dst1]=tmp,8;                                       \
-(EPI_1)        shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;  \
-3:                                                             \
-(p16)  mov val1[1]=r0;                                         \
-(p16)  mov val1[0]=r0;                                         \
-       br.ctop.dptk 2b;                                        \
-       ;;                                                      \
-       br.cond.sptk.many .failure_in2
-
-       //
-       // Since the instruction 'shrp' requires a fixed 128-bit value
-       // specifying the bits to shift, we need to provide 7 cases
-       // below.
-       //
-       SWITCH(p6, 8)
-       SWITCH(p7, 16)
-       SWITCH(p8, 24)
-       SWITCH(p9, 32)
-       SWITCH(p10, 40)
-       SWITCH(p11, 48)
-       SWITCH(p12, 56)
-       ;;
-       CASE(p6, 8)
-       CASE(p7, 16)
-       CASE(p8, 24)
-       CASE(p9, 32)
-       CASE(p10, 40)
-       CASE(p11, 48)
-       CASE(p12, 56)
-       ;;
-       BODY(8)
-       BODY(16)
-       BODY(24)
-       BODY(32)
-       BODY(40)
-       BODY(48)
-       BODY(56)
-       ;;
-.diff_align_do_tail:
-       .pred.rel "mutex", p14, p15
-(p14)  sub src1=src1,t1
-(p14)  adds dst1=-8,dst1
-(p15)  sub dst1=dst1,t1
-       ;;
-4:
-       // Tail correction.
-       //
-       // The problem with this pipelined loop is that the last word is not
-       // loaded and thus part of the last word written is not correct.
-       // To fix that, we simply copy the tail byte by byte.
-
-       sub len1=endsrc,src1,1
-       clrrrb
-       ;;
-       mov ar.ec=PIPE_DEPTH
-       mov pr.rot=1<<16        // p16=true all others are false
-       mov ar.lc=len1
-       ;;
-5:
-       EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
-       EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
-       br.ctop.dptk.few 5b
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // Beginning of long memcpy (i.e. > 16 bytes)
-       //
-.long_copy_user:
-       tbit.nz p6,p7=src1,0    // odd alignment
-       and tmp=7,tmp
-       ;;
-       cmp.eq p10,p8=r0,tmp
-       mov len1=len            // copy because of rotation
-(p8)   br.cond.dpnt .diff_align_copy_user
-       ;;
-       // At this point we know we have more than 16 bytes to copy
-       // and also that both src and dest have the same alignment
-       // which may not be the one we want. So for now we must move
-       // forward slowly until we reach 16byte alignment: no need to
-       // worry about reaching the end of buffer.
-       //
-       EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)      // 1-byte aligned
-(p6)   adds len1=-1,len1;;
-       tbit.nz p7,p0=src1,1
-       ;;
-       EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)      // 2-byte aligned
-(p7)   adds len1=-2,len1;;
-       tbit.nz p8,p0=src1,2
-       ;;
-       //
-       // Stop bit not required after ld4 because if we fail on ld4
-       // we have never executed the ld1, therefore st1 is not executed.
-       //
-       EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)      // 4-byte aligned
-       ;;
-       EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
-       tbit.nz p9,p0=src1,3
-       ;;
-       //
-       // Stop bit not required after ld8 because if we fail on ld8
-       // we have never executed the ld2, therefore st2 is not executed.
-       //
-       EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)      // 8-byte aligned
-       EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
-(p8)   adds len1=-4,len1
-       ;;
-       EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
-(p9)   adds len1=-8,len1;;
-       shr.u cnt=len1,4                // number of 128-bit (2x64bit) words
-       ;;
-       EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
-       tbit.nz p6,p0=len1,3
-       cmp.eq p7,p0=r0,cnt
-       adds tmp=-1,cnt                 // br.ctop is repeat/until
-(p7)   br.cond.dpnt .dotail            // we have less than 16 bytes left
-       ;;
-       adds src2=8,src1
-       adds dst2=8,dst1
-       mov ar.lc=tmp
-       ;;
-       //
-       // 16bytes/iteration
-       //
-2:
-       EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
-(p16)  ld8 val2[0]=[src2],16
-
-       EX(.failure_out, (EPI)  st8 [dst1]=val1[PIPE_DEPTH-1],16)
-(EPI)  st8 [dst2]=val2[PIPE_DEPTH-1],16
-       br.ctop.dptk 2b
-       ;;                      // RAW on src1 when fall through from loop
-       //
-       // Tail correction based on len only
-       //
-       // No matter where we come from (loop or test) the src1 pointer
-       // is 16 byte aligned AND we have less than 16 bytes to copy.
-       //
-.dotail:
-       EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)      // at least 8 bytes
-       tbit.nz p7,p0=len1,2
-       ;;
-       EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)      // at least 4 bytes
-       tbit.nz p8,p0=len1,1
-       ;;
-       EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)      // at least 2 bytes
-       tbit.nz p9,p0=len1,0
-       ;;
-       EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
-       ;;
-       EX(.failure_in1,(p9) ld1 val2[1]=[src1])        // only 1 byte left
-       mov ar.lc=saved_lc
-       ;;
-       EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       EX(.failure_out, (p8)   st2 [dst1]=val2[0],2)
-       mov ar.pfs=saved_pfs
-       ;;
-       EX(.failure_out, (p9)   st1 [dst1]=val2[1])
-       br.ret.sptk.many rp
-
-
-       //
-       // Here we handle the case where the byte by byte copy fails
-       // on the load.
-       // Several factors make the zeroing of the rest of the buffer kind of
-       // tricky:
-       //      - the pipeline: loads/stores are not in sync (pipeline)
-       //
-       //        In the same loop iteration, the dst1 pointer does not directly
-       //        reflect where the faulty load was.
-       //
-       //      - pipeline effect
-       //        When you get a fault on load, you may have valid data from
-       //        previous loads not yet stored, still in transit. Such data must be
-       //        stored normally before moving on to zeroing the rest.
-       //
-       //      - single/multi dispersal independence.
-       //
-       // solution:
-       //      - we don't disrupt the pipeline, i.e. data in transit in
-       //        the software pipeline will eventually be moved to memory.
-       //        We simply replace the load with a simple mov and keep the
-       //        pipeline going. We can't really do this inline because
-       //        p16 is always reset to 1 when lc > 0.
-       //
-.failure_in_pipe1:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-1:
-(p16)  mov val1[0]=r0
-(EPI)  st1 [dst1]=val1[PIPE_DEPTH-1],1
-       br.ctop.dptk 1b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // This is the case where the byte by byte copy fails on the load
-       // when we copy the head. We need to finish the pipeline and copy
-       // zeros for the rest of the destination. Since this happens
-       // at the top we still need to fill the body and tail.
-.failure_in_pipe2:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-2:
-(p16)  mov val1[0]=r0
-(EPI)  st1 [dst1]=val1[PIPE_DEPTH-1],1
-       br.ctop.dptk 2b
-       ;;
-       sub len=enddst,dst1,1           // precompute len
-       br.cond.dptk.many .failure_in1bis
-       ;;
-
-       //
-       // Here we handle the head & tail part when we check for alignment.
-       // The following code handles only the load failures. The
-       // main difficulty comes from the fact that loads/stores are
-       // scheduled. So when you fail on a load, the stores corresponding
-       // to previous successful loads must be executed.
-       //
-       // However some simplifications are possible given the way
-       // things work.
-       //
-       // 1) HEAD
-       // Theory of operation:
-       //
-       //  Page A   | Page B
-       //  ---------|-----
-       //          1|8 x
-       //        1 2|8 x
-       //          4|8 x
-       //        1 4|8 x
-       //        2 4|8 x
-       //      1 2 4|8 x
-       //           |1
-       //           |2 x
-       //           |4 x
-       //
-       // page_size >= 4k (2^12).  (x means 4, 2, 1)
-       // Here we suppose Page A exists and Page B does not.
-       //
-       // As we move towards eight byte alignment we may encounter faults.
-       //      The numbers on each page show the size of the load (current alignment).
-       //
-       // Key point:
-       //      - if you fail on 1, 2, 4 then you have never executed any smaller
-       //        size loads, e.g. failing ld4 means no ld1 nor ld2 executed
-       //        before.
-       //
-       // This allows us to simplify the cleanup code, because basically you
-       // only have to worry about "pending" stores in the case of a failing
-       // ld8(). Given the way the code is written today, this means only
-       // worry about st2, st4. There we can use the information encapsulated
-       // into the predicates.
-       //
-       // Other key point:
-       //      - if you fail on the ld8 in the head, it means you went straight
-       //        to it, i.e. 8byte alignment within a nonexistent page.
-       // Again this comes from the fact that if you crossed just for the ld8 then
-       // you are 8byte aligned but also 16byte aligned, therefore you would
-       // either go for the 16byte copy loop OR the ld8 in the tail part.
-       // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
-       // because it would mean you had 15bytes to copy in which case you
-       // would have defaulted to the byte by byte copy.
-       //
-       //
-       // 2) TAIL
-       // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
-       // aligned.
-       //
-       // Key point:
-       // This means that we either:
-       //              - are right on a page boundary
-       //      OR
-       //              - are at more than 16 bytes from a page boundary with
-       //                at most 15 bytes to copy: no chance of crossing.
-       //
-       // This allows us to assume that if we fail on a load we haven't possibly
-       // executed any of the previous (tail) ones, so we don't need to do
-       // any stores. For instance, if we fail on ld2, this means we had
-       // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
-       //
-       // This means that we are in a situation similar to a fault in the
-       // head part. That's nice!
-       //
-.failure_in1:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-       sub len=endsrc,src1,1
-       //
-       // we know that ret0 can never be zero at this point
-       // because we failed while trying to do a load, i.e. there is still
-       // some work to do.
-       // The failure_in1bis and length problem is taken care of at the
-       // calling side.
-       //
-       ;;
-.failure_in1bis:               // from (.failure_in3)
-       mov ar.lc=len           // Continue with a stupid byte store.
-       ;;
-5:
-       st1 [dst1]=r0,1
-       br.cloop.dptk 5b
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // Here we simply restart the loop but instead
-       // of doing loads we fill the pipeline with zeroes
-       // We can't simply store r0 because we may have valid
-       // data in transit in the pipeline.
-       // ar.lc and ar.ec are setup correctly at this point
-       //
-       // we MUST use src1/endsrc here and not dst1/enddst because
-       // of the pipeline effect.
-       //
-.failure_in3:
-       sub ret0=endsrc,src1    // number of bytes to zero, i.e. not copied
-       ;;
-2:
-(p16)  mov val1[0]=r0
-(p16)  mov val2[0]=r0
-(EPI)  st8 [dst1]=val1[PIPE_DEPTH-1],16
-(EPI)  st8 [dst2]=val2[PIPE_DEPTH-1],16
-       br.ctop.dptk 2b
-       ;;
-       cmp.ne p6,p0=dst1,enddst        // Do we need to finish the tail ?
-       sub len=enddst,dst1,1           // precompute len
-(p6)   br.cond.dptk .failure_in1bis
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-.failure_in2:
-       sub ret0=endsrc,src1
-       cmp.ne p6,p0=dst1,enddst        // Do we need to finish the tail ?
-       sub len=enddst,dst1,1           // precompute len
-(p6)   br.cond.dptk .failure_in1bis
-       ;;
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       //
-       // handling of failures on stores: that's the easy part
-       //
-.failure_out:
-       sub ret0=enddst,dst1
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.lc=saved_lc
-
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-END(__copy_user)
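
The misaligned body of __copy_user() above keeps two adjacent source words in flight and merges them with shrp; the same idea, without faults or pipelining, looks roughly like the C sketch below. It is illustrative only and not part of the changeset; rshift is assumed to be a multiple of 8 between 8 and 56, as in the SWITCH/CASE/BODY macros above.

/* Merge pairs of adjacent 8-byte source words when src and dst alignments
 * differ (little-endian). Note that nwords destination words consume
 * nwords + 1 aligned source words, just as the pipelined loop preloads one
 * extra word.
 */
static void copy_words_shifted(unsigned long *dst, const unsigned long *src_aligned,
			       unsigned int rshift, unsigned long nwords)
{
	unsigned int lshift = 64 - rshift;
	unsigned long prev = src_aligned[0];
	unsigned long i;

	for (i = 0; i < nwords; i++) {
		unsigned long next = src_aligned[i + 1];
		/* roughly: shrp tmp = next, prev, rshift */
		dst[i] = (prev >> rshift) | (next << lshift);
		prev = next;
	}
}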
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/csum_partial_copy.c
--- a/xen/arch/ia64/linux/lib/csum_partial_copy.c       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,151 +0,0 @@
-/*
- * Network Checksum & Copy routine
- *
- * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * Most of the code has been imported from Linux/Alpha
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-
-#include <asm/uaccess.h>
-
-/*
- * XXX Fixme: those 2 inlines are meant for debugging and will go away
- */
-static inline unsigned
-short from64to16(unsigned long x)
-{
-       /* add up 32-bit words for 33 bits */
-       x = (x & 0xffffffff) + (x >> 32);
-       /* add up 16-bit and 17-bit words for 17+c bits */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up 16-bit and 2-bit for 16+c bit */
-       x = (x & 0xffff) + (x >> 16);
-       /* add up carry.. */
-       x = (x & 0xffff) + (x >> 16);
-       return x;
-}
-
-static inline
-unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
-{
-       int odd, count;
-       unsigned long result = (unsigned long)psum;
-
-       if (len <= 0)
-               goto out;
-       odd = 1 & (unsigned long) buff;
-       if (odd) {
-               result = *buff << 8;
-               len--;
-               buff++;
-       }
-       count = len >> 1;               /* nr of 16-bit words.. */
-       if (count) {
-               if (2 & (unsigned long) buff) {
-                       result += *(unsigned short *) buff;
-                       count--;
-                       len -= 2;
-                       buff += 2;
-               }
-               count >>= 1;            /* nr of 32-bit words.. */
-               if (count) {
-                       if (4 & (unsigned long) buff) {
-                               result += *(unsigned int *) buff;
-                               count--;
-                               len -= 4;
-                               buff += 4;
-                       }
-                       count >>= 1;    /* nr of 64-bit words.. */
-                       if (count) {
-                               unsigned long carry = 0;
-                               do {
-                                       unsigned long w = *(unsigned long *) buff;
-                                       count--;
-                                       buff += 8;
-                                       result += carry;
-                                       result += w;
-                                       carry = (w > result);
-                               } while (count);
-                               result += carry;
-                               result = (result & 0xffffffff) + (result >> 32);
-                       }
-                       if (len & 4) {
-                               result += *(unsigned int *) buff;
-                               buff += 4;
-                       }
-               }
-               if (len & 2) {
-                       result += *(unsigned short *) buff;
-                       buff += 2;
-               }
-       }
-       if (len & 1)
-               result += *buff;
-
-       result = from64to16(result);
-
-       if (odd)
-               result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-
-out:
-       return result;
-}
-
-/*
- * XXX Fixme
- *
- * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
- * But it's very tricky to get right even in C.
- */
-extern unsigned long do_csum(const unsigned char *, long);
-
-static unsigned int
-do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
-                               int len, unsigned int psum, int *errp)
-{
-       unsigned long result;
-
-       /* XXX Fixme
-        * for now we separate the copy from checksum for obvious
-        * alignment difficulties. Look at the Alpha code and you'll be
-        * scared.
-        */
-
-       if (__copy_from_user(dst, src, len) != 0 && errp)
-               *errp = -EFAULT;
-
-       result = do_csum(dst, len);
-
-       /* add in old sum, and carry.. */
-       result += psum;
-       /* 32+c bits -> 32 bits */
-       result = (result & 0xffffffff) + (result >> 32);
-       return result;
-}
-
-unsigned int
-csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst,
-                            int len, unsigned int sum, int *errp)
-{
-       if (!access_ok(VERIFY_READ, src, len)) {
-               *errp = -EFAULT;
-               memset(dst, 0, len);
-               return sum;
-       }
-
-       return do_csum_partial_copy_from_user(src, dst, len, sum, errp);
-}
-
-unsigned int
-csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst,
-                         int len, unsigned int sum)
-{
-       return do_csum_partial_copy_from_user(src, dst, len, sum, NULL);
-}
-
-EXPORT_SYMBOL(csum_partial_copy_nocheck);
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/dec_and_lock.c
--- a/xen/arch/ia64/linux/lib/dec_and_lock.c    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2003 Jerome Marchand, Bull S.A.
- *     Cleaned up by David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * This file is released under the GPLv2, or at your option any later version.
- *
- * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction.  This
- * code is an adaptation of the x86 version of "atomic_dec_and_lock()".
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/atomic.h>
-
-/*
- * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock.  Both of these
- * operations have to be done atomically, so that the count doesn't drop to zero without
- * acquiring the spinlock first.
- */
-int
-_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock)
-{
-       int old, new;
-
-       do {
-               old = atomic_read(refcount);
-               new = old - 1;
-
-               if (unlikely (old == 1)) {
-                       /* oops, we may be decrementing to zero, do it the slow way... */
-                       spin_lock(lock);
-                       if (atomic_dec_and_test(refcount))
-                               return 1;
-                       spin_unlock(lock);
-                       return 0;
-               }
-       } while (cmpxchg(&refcount->counter, old, new) != old);
-       return 0;
-}
-
-EXPORT_SYMBOL(_atomic_dec_and_lock);
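
A typical caller reaches the function above through the atomic_dec_and_lock() wrapper, dropping a reference and only taking the lock when the count may hit zero. The following is a hypothetical usage sketch, not part of the changeset, assuming the usual kernel headers (linux/spinlock.h, linux/slab.h) and a made-up object type.

/* Hypothetical usage sketch for atomic_dec_and_lock(). */
struct object {
	atomic_t refcount;
	spinlock_t lock;
	/* ... payload ... */
};

static void object_put(struct object *obj)
{
	/* Returns 1 with obj->lock held iff the count dropped to zero. */
	if (atomic_dec_and_lock(&obj->refcount, &obj->lock)) {
		/* last reference: tear down while holding the lock */
		spin_unlock(&obj->lock);
		kfree(obj);
	}
}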
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/do_csum.S
--- a/xen/arch/ia64/linux/lib/do_csum.S Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,323 +0,0 @@
-/*
- *
- * Optimized version of the standard do_csum() function
- *
- * Return: a 64bit quantity containing the 16bit Internet checksum
- *
- * Inputs:
- *     in0: address of buffer to checksum (char *)
- *     in1: length of the buffer (int)
- *
- * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 02/04/22    Ken Chen <kenneth.w.chen@xxxxxxxxx>
- *             Data locality study on the checksum buffer.
- *             More optimization cleanup - remove excessive stop bits.
- * 02/04/08    David Mosberger <davidm@xxxxxxxxxx>
- *             More cleanup and tuning.
- * 01/04/18    Jun Nakajima <jun.nakajima@xxxxxxxxx>
- *             Clean up and optimize and the software pipeline, loading two
- *             back-to-back 8-byte words per loop. Clean up the initialization
- *             for the loop. Support the cases where load latency = 1 or 2.
- *             Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
- */
-
-#include <asm/asmmacro.h>
-
-//
-// Theory of operations:
-//     The goal is to go as quickly as possible to the point where
-//     we can checksum 16 bytes/loop. Before reaching that point we must
-//     take care of incorrect alignment of first byte.
-//
-//     The code hereafter also takes care of the "tail" part of the buffer
-//     before entering the core loop, if any. The checksum is a sum so it
-//     allows us to commute operations. So we do the "head" and "tail"
-//     first to finish at full speed in the body. Once we get the head and
-//     tail values, we feed them into the pipeline, very handy initialization.
-//
-//     Of course we deal with the special case where the whole buffer fits
-//     into one 8 byte word. In this case we have only one entry in the pipeline.
-//
-//     We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
-//     possible load latency and also to accommodate for head and tail.
-//
-//     The end of the function deals with folding the checksum from 64bits
-//     down to 16bits taking care of the carry.
-//
-//     This version avoids synchronization in the core loop by also using a
-//     pipeline for the accumulation of the checksum in resultx[] (x=1,2).
-//
-//      wordx[] (x=1,2)
-//     |---|
-//      |   | 0                        : new value loaded in pipeline
-//     |---|
-//      |   | -                        : in transit data
-//     |---|
-//      |   | LOAD_LATENCY     : current value to add to checksum
-//     |---|
-//      |   | LOAD_LATENCY+1   : previous value added to checksum
-//      |---|                  (previous iteration)
-//
-//     resultx[] (x=1,2)
-//     |---|
-//      |   | 0                        : initial value
-//     |---|
-//      |   | LOAD_LATENCY-1   : new checksum
-//     |---|
-//      |   | LOAD_LATENCY     : previous value of checksum
-//     |---|
-//      |   | LOAD_LATENCY+1   : final checksum when out of the loop
-//      |---|
-//
-//
-//     See RFC1071 "Computing the Internet Checksum" for various techniques for
-//     calculating the Internet checksum.
-//
-// NOT YET DONE:
-//     - Maybe another algorithm which would take care of the folding at the
-//       end in a different manner
-//     - Work with people more knowledgeable than me on the network stack
-//       to figure out if we could not split the function depending on the
-//       type of packet or alignment we get. Like the ip_fast_csum() routine
-//       where we know we have at least 20bytes worth of data to checksum.
-//     - Do a better job of handling small packets.
-//     - Note on prefetching: it was found that under various load, i.e. ftp read/write,
-//       nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
-//       on the data that buffer points to (partly because the checksum is often preceded by
-//       a copy_from_user()).  This finding indicates that lfetch will not be beneficial since
-//       the data is already in the cache.
-//
-
-#define saved_pfs      r11
-#define hmask          r16
-#define tmask          r17
-#define first1         r18
-#define firstval       r19
-#define firstoff       r20
-#define last           r21
-#define lastval                r22
-#define lastoff                r23
-#define saved_lc       r24
-#define saved_pr       r25
-#define tmp1           r26
-#define tmp2           r27
-#define tmp3           r28
-#define carry1         r29
-#define carry2         r30
-#define first2         r31
-
-#define buf            in0
-#define len            in1
-
-#define LOAD_LATENCY   2       // XXX fix me
-
-#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
-# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
-#endif
-
-#define PIPE_DEPTH                     (LOAD_LATENCY+2)
-#define ELD    p[LOAD_LATENCY]         // end of load
-#define ELD_1  p[LOAD_LATENCY+1]       // and next stage
-
-// unsigned long do_csum(unsigned char *buf,long len)
-
-GLOBAL_ENTRY(do_csum)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,2,16,0,16
-       .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
-       .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
-       mov ret0=r0             // in case we have zero length
-       cmp.lt p0,p6=r0,len     // check for zero length or negative (32bit len)
-       ;;
-       add tmp1=buf,len        // last byte's address
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates (rotation)
-(p6)   br.ret.spnt.many rp     // return if zero or negative length
-
-       mov hmask=-1            // initialize head mask
-       tbit.nz p15,p0=buf,0    // is buf an odd address?
-       and first1=-8,buf       // 8-byte align down address of first1 element
-
-       and firstoff=7,buf      // how many bytes off for first1 element
-       mov tmask=-1            // initialize tail mask
-
-       ;;
-       adds tmp2=-1,tmp1       // last-1
-       and lastoff=7,tmp1      // how many bytes off for last element
-       ;;
-       sub tmp1=8,lastoff      // complement to lastoff
-       and last=-8,tmp2        // address of word containing last byte
-       ;;
-       sub tmp3=last,first1    // tmp3=distance from first1 to last
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc      // save lc
-       cmp.eq p8,p9=last,first1        // everything fits in one word ?
-
-       ld8 firstval=[first1],8 // load, ahead of time, "first1" word
-       and tmp1=7, tmp1        // make sure that if tmp1==8 -> tmp1=0
-       shl tmp2=firstoff,3     // number of bits
-       ;;
-(p9)   ld8 lastval=[last]      // load, ahead of time, "last" word, if needed
-       shl tmp1=tmp1,3         // number of bits
-(p9)   adds tmp3=-8,tmp3       // effectively loaded
-       ;;
-(p8)   mov lastval=r0          // we don't need lastval if first1==last
-       shl hmask=hmask,tmp2    // build head mask, mask off [0,first1off[
-       shr.u tmask=tmask,tmp1  // build tail mask, mask off ]8,lastoff]
-       ;;
-       .body
-#define count tmp3
-
-(p8)   and hmask=hmask,tmask   // apply tail mask to head mask if 1 word only
-(p9)  and word2[0]=lastval,tmask      // mask the last word as appropriate
-       shr.u count=count,3     // how many 8-byte?
-       ;;
-       // If count is odd, finish this 8-byte word so that we can
-       // load two back-to-back 8-byte words per loop thereafter.
-       and word1[0]=firstval,hmask     // and mask it as appropriate
-       tbit.nz p10,p11=count,0         // if (count is odd)
-       ;;
-(p8)   mov result1[0]=word1[0]
-(p9)   add result1[0]=word1[0],word2[0]
-       ;;
-       cmp.ltu p6,p0=result1[0],word1[0]       // check the carry
-       cmp.eq.or.andcm p8,p0=0,count           // exit if zero 8-byte
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-(p8)   br.cond.dptk .do_csum_exit      // if (within an 8-byte word)
-(p11)  br.cond.dptk .do_csum16         // if (count is even)
-
-       // Here count is odd.
-       ld8 word1[1]=[first1],8         // load an 8-byte word
-       cmp.eq p9,p10=1,count           // if (count == 1)
-       adds count=-1,count             // loaded an 8-byte word
-       ;;
-       add result1[0]=result1[0],word1[1]
-       ;;
-       cmp.ltu p6,p0=result1[0],word1[1]
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-(p9)   br.cond.sptk .do_csum_exit      // if (count == 1) exit
-       // Fall through to calculate the checksum, feeding result1[0] as
-       // the initial value in result1[0].
-       //
-       // Calculate the checksum loading two 8-byte words per loop.
-       //
-.do_csum16:
-       add first2=8,first1
-       shr.u count=count,1     // we do 16 bytes per loop
-       ;;
-       adds count=-1,count
-       mov carry1=r0
-       mov carry2=r0
-       brp.loop.imp 1f,2f
-       ;;
-       mov ar.ec=PIPE_DEPTH
-       mov ar.lc=count // set lc
-       mov pr.rot=1<<16
-       // result1[0] must be initialized in advance.
-       mov result2[0]=r0
-       ;;
-       .align 32
-1:
-(ELD_1)        cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
-(pC1[1])adds carry1=1,carry1
-(ELD_1)        cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
-(pC2[1])adds carry2=1,carry2
-(ELD)  add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
-(ELD)  add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
-2:
-(p[0]) ld8 word1[0]=[first1],16
-(p[0]) ld8 word2[0]=[first2],16
-       br.ctop.sptk 1b
-       ;;
-       // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
-(pC1[1])adds carry1=1,carry1   // since we miss the last one
-(pC2[1])adds carry2=1,carry2
-       ;;
-       add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
-       add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
-       ;;
-       cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
-       cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
-       ;;
-(p6)   adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
-(p7)   adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
-       ;;
-       add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
-       ;;
-       cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
-       ;;
-(p6)   adds result1[0]=1,result1[0]
-       ;;
-.do_csum_exit:
-       //
-       // now fold 64 into 16 bits taking care of carry
-       // that's not very good because it has lots of sequentiality
-       //
-       mov tmp3=0xffff
-       zxt4 tmp1=result1[0]
-       shr.u tmp2=result1[0],32
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add result1[0]=tmp1,tmp2
-       ;;
-       and tmp1=result1[0],tmp3
-       shr.u tmp2=result1[0],16
-       ;;
-       add ret0=tmp1,tmp2
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       // if buf was odd then swap bytes
-       mov ar.pfs=saved_pfs            // restore ar.ec
-(p15)  mux1 ret0=ret0,@rev             // reverse word
-       ;;
-       mov ar.lc=saved_lc
-(p15)  shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
-       br.ret.sptk.many rp
-
-//     I (Jun Nakajima) wrote an equivalent code (see below), but it was
-//     not much better than the original. So keep the original there so that
-//     someone else can challenge.
-//
-//     shr.u word1[0]=result1[0],32
-//     zxt4 result1[0]=result1[0]
-//     ;;
-//     add result1[0]=result1[0],word1[0]
-//     ;;
-//     zxt2 result2[0]=result1[0]
-//     extr.u word1[0]=result1[0],16,16
-//     shr.u carry1=result1[0],32
-//     ;;
-//     add result2[0]=result2[0],word1[0]
-//     ;;
-//     add result2[0]=result2[0],carry1
-//     ;;
-//     extr.u ret0=result2[0],16,16
-//     ;;
-//     add ret0=ret0,result2[0]
-//     ;;
-//     zxt2 ret0=ret0
-//     mov ar.pfs=saved_pfs             // restore ar.ec
-//     mov pr=saved_pr,0xffffffffffff0000
-//     ;;
-//     // if buf was odd then swap bytes
-//     mov ar.lc=saved_lc
-//(p15)        mux1 ret0=ret0,@rev             // reverse word
-//     ;;
-//(p15)        shr.u ret0=ret0,64-16   // + shift back to position = swap bytes
-//     br.ret.sptk.many rp
-
-END(do_csum)
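
The head/tail masking trick used by do_csum() above — align down to 8-byte words and mask off the bytes outside the buffer rather than branching on each small case — can be modelled in C roughly as below (little-endian, no software pipelining, illustrative only and not part of the changeset).

/* Illustrative little-endian C sketch of do_csum's head/tail masking. */
static unsigned long csum_words(const unsigned char *buf, long len)
{
	const unsigned long *first = (const unsigned long *)((unsigned long)buf & ~7UL);
	const unsigned long *last  = (const unsigned long *)
					(((unsigned long)buf + len - 1) & ~7UL);
	unsigned long firstoff = (unsigned long)buf & 7;	  /* bytes to drop at the head */
	unsigned long lastoff  = ((unsigned long)buf + len) & 7; /* bytes to keep in the tail */
	unsigned long hmask = ~0UL << (firstoff * 8);
	unsigned long tmask = lastoff ? ~0UL >> ((8 - lastoff) * 8) : ~0UL;
	unsigned long sum = 0, carry = 0;
	const unsigned long *p;

	if (len <= 0)
		return 0;
	if (first == last)
		hmask &= tmask;			/* whole buffer fits in one 8-byte word */

	for (p = first; p <= last; p++) {
		unsigned long w = *p;
		if (p == first)
			w &= hmask;		/* mask off bytes before the buffer */
		else if (p == last)
			w &= tmask;		/* mask off bytes after the buffer */
		sum += w;
		carry += (sum < w);		/* collect carries separately, as the asm does */
	}
	/* Caller still folds 64 -> 16 bits; an odd buf additionally needs the
	 * final byte swap done above with mux1 @rev. */
	return sum + carry;
}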
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/io.c
--- a/xen/arch/ia64/linux/lib/io.c      Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,165 +0,0 @@
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/io.h>
-
-/*
- * Copy data from IO memory space to "real" memory space.
- * This needs to be optimized.
- */
-void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
-{
-       char *dst = to;
-
-       while (count) {
-               count--;
-               *dst++ = readb(from++);
-       }
-}
-EXPORT_SYMBOL(memcpy_fromio);
-
-/*
- * Copy data from "real" memory space to IO memory space.
- * This needs to be optimized.
- */
-void memcpy_toio(volatile void __iomem *to, const void *from, long count)
-{
-       const char *src = from;
-
-       while (count) {
-               count--;
-               writeb(*src++, to++);
-       }
-}
-EXPORT_SYMBOL(memcpy_toio);
-
-/*
- * "memset" on IO memory space.
- * This needs to be optimized.
- */
-void memset_io(volatile void __iomem *dst, int c, long count)
-{
-       unsigned char ch = (char)(c & 0xff);
-
-       while (count) {
-               count--;
-               writeb(ch, dst);
-               dst++;
-       }
-}
-EXPORT_SYMBOL(memset_io);
-
-#ifdef CONFIG_IA64_GENERIC
-
-#undef __ia64_inb
-#undef __ia64_inw
-#undef __ia64_inl
-#undef __ia64_outb
-#undef __ia64_outw
-#undef __ia64_outl
-#undef __ia64_readb
-#undef __ia64_readw
-#undef __ia64_readl
-#undef __ia64_readq
-#undef __ia64_readb_relaxed
-#undef __ia64_readw_relaxed
-#undef __ia64_readl_relaxed
-#undef __ia64_readq_relaxed
-#undef __ia64_writeb
-#undef __ia64_writew
-#undef __ia64_writel
-#undef __ia64_writeq
-#undef __ia64_mmiowb
-
-unsigned int
-__ia64_inb (unsigned long port)
-{
-       return ___ia64_inb(port);
-}
-
-unsigned int
-__ia64_inw (unsigned long port)
-{
-       return ___ia64_inw(port);
-}
-
-unsigned int
-__ia64_inl (unsigned long port)
-{
-       return ___ia64_inl(port);
-}
-
-void
-__ia64_outb (unsigned char val, unsigned long port)
-{
-       ___ia64_outb(val, port);
-}
-
-void
-__ia64_outw (unsigned short val, unsigned long port)
-{
-       ___ia64_outw(val, port);
-}
-
-void
-__ia64_outl (unsigned int val, unsigned long port)
-{
-       ___ia64_outl(val, port);
-}
-
-unsigned char
-__ia64_readb (void __iomem *addr)
-{
-       return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw (void __iomem *addr)
-{
-       return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl (void __iomem *addr)
-{
-       return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq (void __iomem *addr)
-{
-       return ___ia64_readq (addr);
-}
-
-unsigned char
-__ia64_readb_relaxed (void __iomem *addr)
-{
-       return ___ia64_readb (addr);
-}
-
-unsigned short
-__ia64_readw_relaxed (void __iomem *addr)
-{
-       return ___ia64_readw (addr);
-}
-
-unsigned int
-__ia64_readl_relaxed (void __iomem *addr)
-{
-       return ___ia64_readl (addr);
-}
-
-unsigned long
-__ia64_readq_relaxed (void __iomem *addr)
-{
-       return ___ia64_readq (addr);
-}
-
-void
-__ia64_mmiowb(void)
-{
-       ___ia64_mmiowb();
-}
-
-#endif /* CONFIG_IA64_GENERIC */
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/ip_fast_csum.S
--- a/xen/arch/ia64/linux/lib/ip_fast_csum.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,90 +0,0 @@
-/*
- * Optimized version of the ip_fast_csum() function
- * Used for calculating IP header checksum
- *
- * Return: 16bit checksum, complemented
- *
- * Inputs:
- *      in0: address of buffer to checksum (char *)
- *      in1: length of the buffer (int)
- *
- * Copyright (C) 2002 Intel Corp.
- * Copyright (C) 2002 Ken Chen <kenneth.w.chen@xxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-/*
- * Since we know that most likely this function is called with buf aligned
- * on a 4-byte boundary and 20 bytes in length, we can execute rather quickly
- * versus calling the generic version of do_csum, which has lots of overhead in
- * handling various alignments and sizes.  However, due to lack of constraints
- * put on the function input argument, cases with alignment not on 4-byte or
- * size not equal to 20 bytes will be handled by the generic do_csum function.
- */
-
-#define in0    r32
-#define in1    r33
-#define ret0   r8
-
-GLOBAL_ENTRY(ip_fast_csum)
-       .prologue
-       .body
-       cmp.ne  p6,p7=5,in1     // size other than 20 byte?
-       and     r14=3,in0       // is it aligned on 4-byte?
-       add     r15=4,in0       // second source pointer
-       ;;
-       cmp.ne.or.andcm p6,p7=r14,r0
-       ;;
-(p7)   ld4     r20=[in0],8
-(p7)   ld4     r21=[r15],8
-(p6)   br.spnt .generic
-       ;;
-       ld4     r22=[in0],8
-       ld4     r23=[r15],8
-       ;;
-       ld4     r24=[in0]
-       add     r20=r20,r21
-       add     r22=r22,r23
-       ;;
-       add     r20=r20,r22
-       ;;
-       add     r20=r20,r24
-       ;;
-       shr.u   ret0=r20,16     // now need to add the carry
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       shr.u   ret0=r20,16     // add carry again
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       shr.u   ret0=r20,16
-       zxt2    r20=r20
-       ;;
-       add     r20=ret0,r20
-       ;;
-       andcm   ret0=-1,r20
-       .restore sp             // reset frame state
-       br.ret.sptk.many b0
-       ;;
-
-.generic:
-       .prologue
-       .save ar.pfs, r35
-       alloc   r35=ar.pfs,2,2,2,0
-       .save rp, r34
-       mov     r34=b0
-       .body
-       dep.z   out1=in1,2,30
-       mov     out0=in0
-       ;;
-       br.call.sptk.many b0=do_csum
-       ;;
-       andcm   ret0=-1,ret0
-       mov     ar.pfs=r35
-       mov     b0=r34
-       br.ret.sptk.many b0
-END(ip_fast_csum)
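
For the fast path above (a 20-byte, 4-byte-aligned IPv4 header) the computation reduces to summing five 32-bit words and folding; an equivalent C sketch (illustrative only, not part of the changeset) is:

/* C model of the ip_fast_csum() fast path: 20-byte, 4-byte-aligned header. */
static unsigned short ip_header_csum(const void *iph)
{
	const unsigned int *w = iph;
	unsigned long sum = 0;
	int i;

	for (i = 0; i < 5; i++)		/* 5 x 32-bit words = 20 bytes */
		sum += w[i];

	/* fold down to 16 bits, propagating the carries (three folds suffice) */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);

	return (unsigned short)~sum;	/* one's-complement of the folded sum */
}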
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy.S
--- a/xen/arch/ia64/linux/lib/memcpy.S  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,301 +0,0 @@
-/*
- *
- * Optimized version of the standard memcpy() function
- *
- * Inputs:
- *     in0:    destination address
- *     in1:    source address
- *     in2:    number of bytes to copy
- * Output:
- *     no return value
- *
- * Copyright (C) 2000-2001 Hewlett-Packard Co
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(memcpy)
-
-#      define MEM_LAT  21              /* latency to memory */
-
-#      define dst      r2
-#      define src      r3
-#      define retval   r8
-#      define saved_pfs r9
-#      define saved_lc r10
-#      define saved_pr r11
-#      define cnt      r16
-#      define src2     r17
-#      define t0       r18
-#      define t1       r19
-#      define t2       r20
-#      define t3       r21
-#      define t4       r22
-#      define src_end  r23
-
-#      define N        (MEM_LAT + 4)
-#      define Nrot     ((N + 7) & ~7)
-
-       /*
-        * First, check if everything (src, dst, len) is a multiple of eight.  If
-        * so, we handle everything with no taken branches (other than the loop
-        * itself) and a small icache footprint.  Otherwise, we jump off to
-        * the more general copy routine handling arbitrary
-        * sizes/alignment etc.
-        */
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
-       .save ar.lc, saved_lc
-       mov saved_lc=ar.lc
-       or t0=in0,in1
-       ;;
-
-       or t0=t0,in2
-       .save pr, saved_pr
-       mov saved_pr=pr
-
-       .body
-
-       cmp.eq p6,p0=in2,r0     // zero length?
-       mov retval=in0          // return dst
-(p6)   br.ret.spnt.many rp     // zero length, return immediately
-       ;;
-
-       mov dst=in0             // copy because of rotation
-       shr.u cnt=in2,3         // number of 8-byte words to copy
-       mov pr.rot=1<<16
-       ;;
-
-       adds cnt=-1,cnt         // br.ctop is repeat/until
-       cmp.gtu p7,p0=16,in2    // copying less than 16 bytes?
-       mov ar.ec=N
-       ;;
-
-       and t0=0x7,t0
-       mov ar.lc=cnt
-       ;;
-       cmp.ne p6,p0=t0,r0
-
-       mov src=in1             // copy because of rotation
-(p7)   br.cond.spnt.few .memcpy_short
-(p6)   br.cond.spnt.few .memcpy_long
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-       .rotr val[N]
-       .rotp p[N]
-       .align 32
-1: { .mib
-(p[0]) ld8 val[0]=[src],8
-       nop.i 0
-       brp.loop.imp 1b, 2f
-}
-2: { .mfb
-(p[N-1])st8 [dst]=val[N-1],8
-       nop.f 0
-       br.ctop.dptk.few 1b
-}
-       ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,-1
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       /*
-        * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
-        * copy loop.  This performs relatively poorly on Itanium, but it doesn't
-        * get used very often (gcc inlines small copies) and due to atomicity
-        * issues, we want to avoid read-modify-write of entire words.
-        */
-       .align 32
-.memcpy_short:
-       adds cnt=-1,in2         // br.ctop is repeat/until
-       mov ar.ec=MEM_LAT
-       brp.loop.imp 1f, 2f
-       ;;
-       mov ar.lc=cnt
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       ;;
-       /*
-        * It is faster to put a stop bit in the loop here because it makes
-        * the pipeline shorter (and latency is what matters on short copies).
-        */
-       .align 32
-1: { .mib
-(p[0]) ld1 val[0]=[src],1
-       nop.i 0
-       brp.loop.imp 1b, 2f
-} ;;
-2: { .mfb
-(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
-       nop.f 0
-       br.ctop.dptk.few 1b
-} ;;
-       mov ar.lc=saved_lc
-       mov pr=saved_pr,-1
-       mov ar.pfs=saved_pfs
-       br.ret.sptk.many rp
-
-       /*
-        * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
-        * an overriding concern here, but throughput is.  We first do
-        * sub-word copying until the destination is aligned, then we check
-        * if the source is also aligned.  If so, we do a simple load/store-loop
-        * until there are less than 8 bytes left over and then we do the tail,
-        * by storing the last few bytes using sub-word copying.  If the source
-        * is not aligned, we branch off to the non-congruent loop.
-        *
-        *   stage:   op:
-        *         0  ld
-        *         :
-        * MEM_LAT+3  shrp
-        * MEM_LAT+4  st
-        *
-        * On Itanium, the pipeline itself runs without stalls.  However, br.ctop
-        * seems to introduce an unavoidable bubble in the pipeline so the overall
-        * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
-        * of 4 byte/cycle.  Still not bad.
-        */
-#      undef N
-#      undef Nrot
-#      define N        (MEM_LAT + 5)           /* number of stages */
-#      define Nrot     ((N+1 + 2 + 7) & ~7)    /* number of rotating regs */
-
-#define LOG_LOOP_SIZE  6
-
-.memcpy_long:
-       alloc t3=ar.pfs,3,Nrot,0,Nrot   // resize register frame
-       and t0=-8,src           // t0 = src & ~7
-       and t2=7,src            // t2 = src & 7
-       ;;
-       ld8 t0=[t0]             // t0 = 1st source word
-       adds src2=7,src         // src2 = (src + 7)
-       sub t4=r0,dst           // t4 = -dst
-       ;;
-       and src2=-8,src2        // src2 = (src + 7) & ~7
-       shl t2=t2,3             // t2 = 8*(src & 7)
-       shl t4=t4,3             // t4 = 8*(dst & 7)
-       ;;
-       ld8 t1=[src2]           // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
-       sub t3=64,t2            // t3 = 64-8*(src & 7)
-       shr.u t0=t0,t2
-       ;;
-       add src_end=src,in2
-       shl t1=t1,t3
-       mov pr=t4,0x38          // (p5,p4,p3)=(dst & 7)
-       ;;
-       or t0=t0,t1
-       mov cnt=r0
-       adds src_end=-1,src_end
-       ;;
-(p3)   st1 [dst]=t0,1
-(p3)   shr.u t0=t0,8
-(p3)   adds cnt=1,cnt
-       ;;
-(p4)   st2 [dst]=t0,2
-(p4)   shr.u t0=t0,16
-(p4)   adds cnt=2,cnt
-       ;;
-(p5)   st4 [dst]=t0,4
-(p5)   adds cnt=4,cnt
-       and src_end=-8,src_end  // src_end = last word of source buffer
-       ;;
-
-       // At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
-
-1:{    add src=cnt,src                 // make src point to remainder of source buffer
-       sub cnt=in2,cnt                 // cnt = number of bytes left to copy
-       mov t4=ip
-  }    ;;
-       and src2=-8,src                 // align source pointer
-       adds t4=.memcpy_loops-1b,t4
-       mov ar.ec=N
-
-       and t0=7,src                    // t0 = src & 7
-       shr.u t2=cnt,3                  // t2 = number of 8-byte words left to copy
-       shl cnt=cnt,3                   // move bits 0-2 to 3-5
-       ;;
-
-       .rotr val[N+1], w[2]
-       .rotp p[N]
-
-       cmp.ne p6,p0=t0,r0              // is src aligned, too?
-       shl t0=t0,LOG_LOOP_SIZE         // t0 = 8*(src & 7)
-       adds t2=-1,t2                   // br.ctop is repeat/until
-       ;;
-       add t4=t0,t4
-       mov pr=cnt,0x38                 // set (p5,p4,p3) to # of bytes last-word bytes to copy
-       mov ar.lc=t2
-       ;;
-       nop.m   0
-       ;;
-       nop.m   0
-       nop.i   0
-       ;;
-       nop.m   0
-       ;;
-(p6)   ld8 val[1]=[src2],8             // prime the pump...
-       mov b6=t4
-       br.sptk.few b6
-       ;;
-
-.memcpy_tail:
-       // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
-       // less than 8) and t0 contains the last few bytes of the src buffer:
-(p5)   st4 [dst]=t0,4
-(p5)   shr.u t0=t0,32
-       mov ar.lc=saved_lc
-       ;;
-(p4)   st2 [dst]=t0,2
-(p4)   shr.u t0=t0,16
-       mov ar.pfs=saved_pfs
-       ;;
-(p3)   st1 [dst]=t0
-       mov pr=saved_pr,-1
-       br.ret.sptk.many rp
-
-///////////////////////////////////////////////////////
-       .align 64
-
-#define COPY(shift,index)                                                              \
- 1: { .mib                                                                             \
-       (p[0])          ld8 val[0]=[src2],8;                                            \
-       (p[MEM_LAT+3])  shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;            \
-                       brp.loop.imp 1b, 2f                                             \
-    };                                                                                 \
- 2: { .mfb                                                                             \
-       (p[MEM_LAT+4])  st8 [dst]=w[1],8;                                               \
-                       nop.f 0;                                                        \
-                       br.ctop.dptk.few 1b;                                            \
-    };                                                                                 \
-                       ;;                                                              \
-                       ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */    \
-                       ;;                                                              \
-                       shrp t0=val[N-1],val[N-index],shift;                            \
-                       br .memcpy_tail
-.memcpy_loops:
-       COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
-       COPY(8, 0)
-       COPY(16, 0)
-       COPY(24, 0)
-       COPY(32, 0)
-       COPY(40, 0)
-       COPY(48, 0)
-       COPY(56, 0)
-
-END(memcpy)
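
The comments above describe three cases: a byte-at-a-time loop for short copies, a straight 8-byte word loop when source, destination and length are all multiples of eight, and a shrp-based merging loop otherwise. A simplified C outline of that dispatch (the merging path is reduced to a byte loop here; memcpy_sketch is an illustrative name, not the routine being removed):

#include <stddef.h>
#include <stdint.h>

static void *memcpy_sketch(void *dst, const void *src, size_t len)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (len < 16 || (((uintptr_t)dst | (uintptr_t)src | len) & 7)) {
                /* .memcpy_short / .memcpy_long: copy a byte at a time here
                 * (the real long path merges misaligned words with shrp) */
                while (len--)
                        *d++ = *s++;
                return dst;
        }
        /* everything is a multiple of eight: copy whole 64-bit words */
        for (; len; len -= 8, d += 8, s += 8)
                *(uint64_t *)d = *(const uint64_t *)s;
        return dst;
}
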
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strlen_user.S
--- a/xen/arch/ia64/linux/lib/strlen_user.S     Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,198 +0,0 @@
-/*
- * Optimized version of the strlen_user() function
- *
- * Inputs:
- *     in0     address of buffer
- *
- * Outputs:
- *     ret0    0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- *     Stephane Eranian <eranian@xxxxxxxxxx>
- *
- * 01/19/99 S.Eranian heavily enhanced version (see details below)
- * 09/24/99 S.Eranian added speculation recovery code
- */
-
-#include <asm/asmmacro.h>
-
-//
-// int strlen_user(char *)
-// ------------------------
-// Returns:
-//     - length of string + 1
-//     - 0 in case an exception is raised
-//
-// This is an enhanced version of the basic strlen_user. it includes a
-// combination of compute zero index (czx), parallel comparisons, speculative
-// loads and loop unroll using rotating registers.
-//
-// General Ideas about the algorithm:
-//       The goal is to look at the string in chunks of 8 bytes.
-//       so we need to do a few extra checks at the beginning because the
-//       string may not be 8-byte aligned. In this case we load the 8byte
-//       quantity which includes the start of the string and mask the unused
-//       bytes with 0xff to avoid confusing czx.
-//       We use speculative loads and software pipelining to hide memory
-//       latency and do read ahead safely. This way we defer any exception.
-//
-//       Because we don't want the kernel to be relying on particular
-//       settings of the DCR register, we provide recovery code in case
-//       speculation fails. The recovery code is going to "redo" the work using
-//       only normal loads. If we still get a fault then we return an
-//       error (ret0=0). Otherwise we return the strlen+1 as usual.
-//       The fact that speculation may fail can be caused, for instance, by
-//       the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
-//       a NaT bit will be set if the translation is not present. The normal
-//       load, on the other hand, will cause the translation to be inserted
-//       if the mapping exists.
-//
-//       It should be noted that we execute recovery code only when we need
-//       to use the data that has been speculatively loaded: we don't execute
-//       recovery code on pure read ahead data.
-//
-// Remarks:
-//     - the cmp r0,r0 is used as a fast way to initialize a predicate
-//       register to 1. This is required to make sure that we get the parallel
-//       compare correct.
-//
-//     - we don't use the epilogue counter to exit the loop but we need to set
-//       it to zero beforehand.
-//
-//     - after the loop we must test for Nat values because neither the
-//       czx nor cmp instruction raise a NaT consumption fault. We must be
-//       careful not to look too far for a Nat for which we don't care.
-//       For instance we don't need to look at a NaT in val2 if the zero byte
-//       was in val1.
-//
-//     - Clearly performance tuning is required.
-//
-
-#define saved_pfs      r11
-#define        tmp             r10
-#define base           r16
-#define orig           r17
-#define saved_pr       r18
-#define src            r19
-#define mask           r20
-#define val            r21
-#define val1           r22
-#define val2           r23
-
-GLOBAL_ENTRY(__strlen_user)
-       .prologue
-       .save ar.pfs, saved_pfs
-       alloc saved_pfs=ar.pfs,11,0,0,8
-
-       .rotr v[2], w[2]        // declares our 4 aliases
-
-       extr.u tmp=in0,0,3      // tmp=least significant 3 bits
-       mov orig=in0            // keep trackof initial byte address
-       dep src=0,in0,0,3       // src=8byte-aligned in0 address
-       .save pr, saved_pr
-       mov saved_pr=pr         // preserve predicates (rotation)
-       ;;
-
-       .body
-
-       ld8.s v[1]=[src],8      // load the initial 8bytes (must speculate)
-       shl tmp=tmp,3           // multiply by 8bits/byte
-       mov mask=-1             // our mask
-       ;;
-       ld8.s w[1]=[src],8      // load next 8 bytes in 2nd pipeline
-       cmp.eq p6,p0=r0,r0      // sets p6 (required because of // cmp.and)
-       sub tmp=64,tmp          // how many bits to shift our mask on the right
-       ;;
-       shr.u   mask=mask,tmp   // zero enough bits to hold v[1] valuable part
-       mov ar.ec=r0            // clear epilogue counter (saved in ar.pfs)
-       ;;
-       add base=-16,src        // keep track of aligned base
-       chk.s v[1], .recover    // if already NaT, then directly skip to recover
-       or v[1]=v[1],mask       // now we have a safe initial byte pattern
-       ;;
-1:
-       ld8.s v[0]=[src],8      // speculatively load next
-       czx1.r val1=v[1]        // search 0 byte from right
-       czx1.r val2=w[1]        // search 0 byte from right following 8bytes
-       ;;
-       ld8.s w[0]=[src],8      // speculatively load next to next
-       cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
-       cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
-(p6)   br.wtop.dptk.few 1b     // loop until p6 == 0
-       ;;
-       //
-       // We must return try the recovery code iff
-       // val1_is_nat || (val1==8 && val2_is_nat)
-       //
-       // XXX Fixme
-       //      - there must be a better way of doing the test
-       //
-       cmp.eq  p8,p9=8,val1    // p6 = val1 had zero (disambiguate)
-       tnat.nz p6,p7=val1      // test NaT on val1
-(p6)   br.cond.spnt .recover   // jump to recovery if val1 is NaT
-       ;;
-       //
-       // if we come here p7 is true, i.e., initialized for // cmp
-       //
-       cmp.eq.and  p7,p0=8,val1// val1==8?
-       tnat.nz.and p7,p0=val2  // test NaT if val2
-(p7)   br.cond.spnt .recover   // jump to recovery if val2 is NaT
-       ;;
-(p8)   mov val1=val2           // val2 contains the value
-(p8)   adds src=-16,src        // correct position when 3 ahead
-(p9)   adds src=-24,src        // correct position when 4 ahead
-       ;;
-       sub ret0=src,orig       // distance from origin
-       sub tmp=7,val1          // 7=8-1 because this strlen returns strlen+1
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       sub ret0=ret0,tmp       // length=now - back -1
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp     // end of normal execution
-
-       //
-       // Outlined recovery code when speculation failed
-       //
-       // This time we don't use speculation and rely on the normal exception
-       // mechanism. that's why the loop is not as good as the previous one
-       // because read ahead is not possible
-       //
-       // XXX Fixme
-       //      - today we restart from the beginning of the string instead
-       //        of trying to continue where we left off.
-       //
-.recover:
-       EX(.Lexit1, ld8 val=[base],8)   // load the initial bytes
-       ;;
-       or val=val,mask                 // remask first bytes
-       cmp.eq p0,p6=r0,r0              // nullify first ld8 in loop
-       ;;
-       //
-       // ar.ec is still zero here
-       //
-2:
-       EX(.Lexit1, (p6) ld8 val=[base],8)
-       ;;
-       czx1.r val1=val         // search 0 byte from right
-       ;;
-       cmp.eq p6,p0=8,val1     // val1==8 ?
-(p6)   br.wtop.dptk.few 2b     // loop until p6 == 0
-       ;;
-       sub ret0=base,orig      // distance from base
-       sub tmp=7,val1          // 7=8-1 because this strlen returns strlen+1
-       mov pr=saved_pr,0xffffffffffff0000
-       ;;
-       sub ret0=ret0,tmp       // length=now - back -1
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp     // end of successful recovery code
-
-       //
-       // We failed even on the normal load (called from exception handler)
-       //
-.Lexit1:
-       mov ret0=0
-       mov pr=saved_pr,0xffffffffffff0000
-       mov ar.pfs=saved_pfs    // because of ar.ec, restore no matter what
-       br.ret.sptk.many rp
-END(__strlen_user)
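
Stripped of the speculation and recovery machinery, the contract described above is simple: count bytes up to and including the NUL, and return 0 if any access faults. A behavioural sketch in C, where get_user_byte() is a hypothetical fault-checked load standing in for the EX()/ld8.s accesses (not an existing API):

/* hypothetical helper: returns nonzero if the user access faults */
extern int get_user_byte(unsigned char *dst, const char *src);

static long strlen_user_sketch(const char *buf)
{
        long len = 0;
        unsigned char c;

        do {
                if (get_user_byte(&c, buf + len))
                        return 0;               /* fault: report length 0 */
                len++;
        } while (c != '\0');
        return len;                             /* strlen(buf) + 1, NUL included */
}
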
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strncpy_from_user.S
--- a/xen/arch/ia64/linux/lib/strncpy_from_user.S       Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,44 +0,0 @@
-/*
- * Just like strncpy() except that if a fault occurs during copying,
- * -EFAULT is returned.
- *
- * Inputs:
- *     in0:    address of destination buffer
- *     in1:    address of string to be copied
- *     in2:    length of buffer in bytes
- * Outputs:
- *     r8:     -EFAULT in case of fault or number of bytes copied if no fault
- *
- * Copyright (C) 1998-2001 Hewlett-Packard Co
- * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- *
- * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
- *                      by Andreas Schwab <schwab@xxxxxxx>).
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strncpy_from_user)
-       alloc r2=ar.pfs,3,0,0,0
-       mov r8=0
-       mov r9=in1
-       ;;
-       add r10=in1,in2
-       cmp.eq p6,p0=r0,in2
-(p6)   br.ret.spnt.many rp
-
-       // XXX braindead copy loop---this needs to be optimized
-.Loop1:
-       EX(.Lexit, ld1 r8=[in1],1)
-       ;;
-       EX(.Lexit, st1 [in0]=r8,1)
-       cmp.ne p6,p7=r8,r0
-       ;;
-(p6)   cmp.ne.unc p8,p0=in1,r10
-(p8)   br.cond.dpnt.few .Loop1
-       ;;
-(p6)   mov r8=in2              // buffer filled up---return buffer length
-(p7)   sub r8=in1,r9,1         // return string length (excluding NUL character)
-[.Lexit:]
-       br.ret.sptk.many rp
-END(__strncpy_from_user)
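
The documented behaviour, written out in C for reference (user_copy_byte() is a hypothetical fault-returning byte copy playing the role of the EX() load/store pair above):

#include <errno.h>

extern int user_copy_byte(char *dst, const char *src);  /* nonzero on fault */

static long strncpy_from_user_sketch(char *dst, const char *src, long count)
{
        long n;

        if (count == 0)
                return 0;
        for (n = 0; n < count; n++) {
                if (user_copy_byte(dst + n, src + n))
                        return -EFAULT;
                if (dst[n] == '\0')
                        return n;               /* string length, NUL excluded */
        }
        return count;                           /* buffer filled up */
}
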
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strnlen_user.S
--- a/xen/arch/ia64/linux/lib/strnlen_user.S    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,45 +0,0 @@
-/*
- * Returns 0 if exception before NUL or reaching the supplied limit (N),
- * a value greater than N if the string is longer than the limit, else
- * strlen.
- *
- * Inputs:
- *     in0:    address of buffer
- *     in1:    string length limit N
- * Outputs:
- *     r8:     0 in case of fault, strlen(buffer)+1 otherwise
- *
- * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(__strnlen_user)
-       .prologue
-       alloc r2=ar.pfs,2,0,0,0
-       .save ar.lc, r16
-       mov r16=ar.lc                   // preserve ar.lc
-
-       .body
-
-       add r3=-1,in1
-       ;;
-       mov ar.lc=r3
-       mov r9=0
-       ;;
-       // XXX braindead strlen loop---this needs to be optimized
-.Loop1:
-       EXCLR(.Lexit, ld1 r8=[in0],1)
-       add r9=1,r9
-       ;;
-       cmp.eq p6,p0=r8,r0
-(p6)   br.cond.dpnt .Lexit
-       br.cloop.dptk.few .Loop1
-
-       add r9=1,in1                    // NUL not found---return N+1
-       ;;
-.Lexit:
-       mov r8=r9
-       mov ar.lc=r16                   // restore ar.lc
-       br.ret.sptk.many rp
-END(__strnlen_user)
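
For reference, the same contract in C (reusing the hypothetical get_user_byte() helper from the __strlen_user sketch above):

extern int get_user_byte(unsigned char *dst, const char *src);

static long strnlen_user_sketch(const char *buf, long limit)
{
        long n;
        unsigned char c;

        for (n = 0; n < limit; n++) {
                if (get_user_byte(&c, buf + n))
                        return 0;               /* fault before NUL or limit */
                if (c == '\0')
                        return n + 1;           /* strlen(buf) + 1 */
        }
        return limit + 1;                       /* longer than the limit */
}
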
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/xor.S
--- a/xen/arch/ia64/linux/lib/xor.S     Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,184 +0,0 @@
-/*
- * arch/ia64/lib/xor.S
- *
- * Optimized RAID-5 checksumming functions for IA-64.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(xor_ia64_2)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 3, 0, 13, 16
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       ;;
-       .rotr s1[6+1], s2[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[6+1])st8.nta [r8] = d[1], 8
-       nop.f 0
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_2)
-
-GLOBAL_ENTRY(xor_ia64_3)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 4, 0, 20, 24
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-       ;;
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], s3[6]
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_3)
-
-GLOBAL_ENTRY(xor_ia64_4)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 5, 0, 27, 32
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       mov r19 = in4
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r20 = s3[6], s4[6]
-       ;;
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r20
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_4)
-
-GLOBAL_ENTRY(xor_ia64_5)
-       .prologue
-       .fframe 0
-       .save ar.pfs, r31
-       alloc r31 = ar.pfs, 6, 0, 34, 40
-       .save ar.lc, r30
-       mov r30 = ar.lc
-       .save pr, r29
-       mov r29 = pr
-       ;;
-       .body
-       mov r8 = in1
-       mov ar.ec = 6 + 2
-       shr in0 = in0, 3
-       ;;
-       adds in0 = -1, in0
-       mov r16 = in1
-       mov r17 = in2
-       ;;
-       mov r18 = in3
-       mov ar.lc = in0
-       mov pr.rot = 1 << 16
-       mov r19 = in4
-       mov r20 = in5
-       ;;
-       .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
-       .rotp p[6+2]
-0:
-(p[0]) ld8.nta s1[0] = [r16], 8
-(p[0]) ld8.nta s2[0] = [r17], 8
-(p[6]) xor d[0] = s1[6], s2[6]
-(p[0]) ld8.nta s3[0] = [r18], 8
-(p[0]) ld8.nta s4[0] = [r19], 8
-(p[6]) xor r21 = s3[6], s4[6]
-       ;;
-(p[0]) ld8.nta s5[0] = [r20], 8
-(p[6+1])st8.nta [r8] = d[1], 8
-(p[6]) xor d[0] = d[0], r21
-       ;;
-(p[6])   xor d[0] = d[0], s5[6]
-       nop.f 0
-       br.ctop.dptk.few 0b
-       ;;
-       mov ar.lc = r30
-       mov pr = r29, -1
-       br.ret.sptk.few rp
-END(xor_ia64_5)
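
The unrolled, software-pipelined loops above all compute the same thing: XOR two to five source blocks into the first block, one 64-bit word at a time. In plain C (illustrative only; bytes is assumed here to be a multiple of the word size):

static void xor_2_sketch(unsigned long bytes, unsigned long *p1,
                         const unsigned long *p2)
{
        unsigned long i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i];
}

static void xor_3_sketch(unsigned long bytes, unsigned long *p1,
                         const unsigned long *p2, const unsigned long *p3)
{
        unsigned long i, words = bytes / sizeof(unsigned long);

        for (i = 0; i < words; i++)
                p1[i] ^= p2[i] ^ p3[i];
}

/* xor_ia64_4 and xor_ia64_5 extend the same pattern to four and five sources */
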
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/minstate.h
--- a/xen/arch/ia64/linux/minstate.h    Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,254 +0,0 @@
-#include <linux/config.h>
-
-#include <asm/cache.h>
-
-#include "entry.h"
-
-/*
- * For ivt.s we want to access the stack virtually so we don't have to disable translation
- * on interrupts.
- *
- *  On entry:
- *     r1:     pointer to current task (ar.k6)
- */
-#define MINSTATE_START_SAVE_MIN_VIRT                                                           \
-(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
-       ;;                                                                                      \
-(pUStk)        mov.m r24=ar.rnat;                                                              \
-(pUStk)        addl r22=IA64_RBS_OFFSET,r1;                    /* compute base of RBS */       \
-(pKStk) mov r1=sp;                                     /* get sp  */                           \
-       ;;                                                                                      \
-(pUStk) lfetch.fault.excl.nt1 [r22];                                                           \
-(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
-(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */          \
-       ;;                                                                                      \
-(pUStk)        mov ar.bspstore=r22;                            /* switch to kernel RBS */      \
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;                 /* if in kernel mode, use sp (r12) */   \
-       ;;                                                                                      \
-(pUStk)        mov r18=ar.bsp;                                                                 \
-(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */     \
-
-#define MINSTATE_END_SAVE_MIN_VIRT                                                             \
-       bsw.1;                  /* switch back to bank 1 (must be last in insn group) */        \
-       ;;
-
-/*
- * For mca_asm.S we want to access the stack physically since the state is 
saved before we
- * go virtual and don't want to destroy the iip or ipsr.
- */
-#define MINSTATE_START_SAVE_MIN_PHYS                                                           \
-(pKStk) mov r3=IA64_KR(PER_CPU_DATA);;                                                         \
-(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;;                                                   \
-(pKStk) ld8 r3 = [r3];;                                                                        \
-(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;;                                            \
-(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3;                                          \
-(pUStk)        mov ar.rsc=0;           /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */     \
-(pUStk)        addl r22=IA64_RBS_OFFSET,r1;            /* compute base of register backing store */    \
-       ;;                                                                                      \
-(pUStk)        mov r24=ar.rnat;                                                                \
-(pUStk)        addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1;   /* compute base of memory stack */      \
-(pUStk)        mov r23=ar.bspstore;                            /* save ar.bspstore */          \
-(pUStk)        dep r22=-1,r22,61,3;                    /* compute kernel virtual addr of RBS */        \
-       ;;                                                                                      \
-(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1;         /* if in kernel mode, use sp (r12) */           \
-(pUStk)        mov ar.bspstore=r22;                    /* switch to kernel RBS */              \
-       ;;                                                                                      \
-(pUStk)        mov r18=ar.bsp;                                                                 \
-(pUStk)        mov ar.rsc=0x3;         /* set eager mode, pl 0, little-endian, loadrs=0 */     \
-
-#define MINSTATE_END_SAVE_MIN_PHYS                                                             \
-       dep r12=-1,r12,61,3;            /* make sp a kernel virtual address */                  \
-       ;;
-
-#ifdef MINSTATE_VIRT
-# define MINSTATE_GET_CURRENT(reg)     \
-               movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\
-               ld8 reg=[reg]
-# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_VIRT
-# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_VIRT
-#endif
-
-#ifdef MINSTATE_PHYS
-# define MINSTATE_GET_CURRENT(reg)     mov reg=IA64_KR(CURRENT);; tpa reg=reg
-# define MINSTATE_START_SAVE_MIN       MINSTATE_START_SAVE_MIN_PHYS
-# define MINSTATE_END_SAVE_MIN         MINSTATE_END_SAVE_MIN_PHYS
-#endif
-
-/*
- * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
- * the minimum state necessary that allows us to turn psr.ic back
- * on.
- *
- * Assumed state upon entry:
- *     psr.ic: off
- *     r31:    contains saved predicates (pr)
- *
- * Upon exit, the state is as follows:
- *     psr.ic: off
- *      r2 = points to &pt_regs.r16
- *      r8 = contents of ar.ccv
- *      r9 = contents of ar.csd
- *     r10 = contents of ar.ssd
- *     r11 = FPSR_DEFAULT
- *     r12 = kernel sp (kernel virtual address)
- *     r13 = points to current task_struct (kernel virtual address)
- *     p15 = TRUE if psr.i is set in cr.ipsr
- *     predicate registers (other than p2, p3, and p15), b6, r3, r14, r15:
- *             preserved
- *
- * Note that psr.ic is NOT turned on by this macro.  This is so that
- * we can pass interruption state as arguments to a handler.
- */
-#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA)                                                      \
-       MINSTATE_GET_CURRENT(r16);      /* M (or M;;I) */                                       \
-       mov r27=ar.rsc;                 /* M */                                                 \
-       mov r20=r1;                     /* A */                                                 \
-       mov r25=ar.unat;                /* M */                                                 \
-       mov r29=cr.ipsr;                /* M */                                                 \
-       mov r26=ar.pfs;                 /* I */                                                 \
-       mov r28=cr.iip;                 /* M */                                                 \
-       mov r21=ar.fpsr;                /* M */                                                 \
-       COVER;                          /* B;; (or nothing) */                                  \
-       ;;                                                                                      \
-       adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16;                                         \
-       ;;                                                                                      \
-       ld1 r17=[r16];                          /* load current->thread.on_ustack flag */       \
-       st1 [r16]=r0;                           /* clear current->thread.on_ustack flag */      \
-       adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16                                          \
-       /* switch from user to kernel RBS: */                                                   \
-       ;;                                                                                      \
-       invala;                         /* M */                                                 \
-       SAVE_IFS;                                                                               \
-       cmp.eq pKStk,pUStk=r0,r17;              /* are we in kernel mode already? */            \
-       ;;                                                                                      \
-       MINSTATE_START_SAVE_MIN                                                                 \
-       adds r17=2*L1_CACHE_BYTES,r1;           /* really: biggest cache-line size */           \
-       adds r16=PT(CR_IPSR),r1;                                                                \
-       ;;                                                                                      \
-       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES;                                             \
-       st8 [r16]=r29;          /* save cr.ipsr */                                              \
-       ;;                                                                                      \
-       lfetch.fault.excl.nt1 [r17];                                                            \
-       tbit.nz p15,p0=r29,IA64_PSR_I_BIT;                                                      \
-       mov r29=b0                                                                              \
-       ;;                                                                                      \
-       adds r16=PT(R8),r1;     /* initialize first base pointer */                             \
-       adds r17=PT(R9),r1;     /* initialize second base pointer */                            \
-(pKStk)        mov r18=r0;             /* make sure r18 isn't NaT */                           \
-       ;;                                                                                      \
-.mem.offset 0,0; st8.spill [r16]=r8,16;                                                        \
-.mem.offset 8,0; st8.spill [r17]=r9,16;                                                        \
-        ;;                                                                                     \
-.mem.offset 0,0; st8.spill [r16]=r10,24;                                                       \
-.mem.offset 8,0; st8.spill [r17]=r11,24;                                                       \
-        ;;                                                                                     \
-       st8 [r16]=r28,16;       /* save cr.iip */                                               \
-       st8 [r17]=r30,16;       /* save cr.ifs */                                               \
-(pUStk)        sub r18=r18,r22;        /* r18=RSE.ndirty*8 */                                  \
-       mov r8=ar.ccv;                                                                          \
-       mov r9=ar.csd;                                                                          \
-       mov r10=ar.ssd;                                                                         \
-       movl r11=FPSR_DEFAULT;   /* L-unit */                                                   \
-       ;;                                                                                      \
-       st8 [r16]=r25,16;       /* save ar.unat */                                              \
-       st8 [r17]=r26,16;       /* save ar.pfs */                                               \
-       shl r18=r18,16;         /* compute ar.rsc to be used for "loadrs" */                    \
-       ;;                                                                                      \
-       st8 [r16]=r27,16;       /* save ar.rsc */                                               \
-(pUStk)        st8 [r17]=r24,16;       /* save ar.rnat */                                      \
-(pKStk)        adds r17=16,r17;        /* skip over ar_rnat field */                           \
-       ;;                      /* avoid RAW on r16 & r17 */                                    \
-(pUStk)        st8 [r16]=r23,16;       /* save ar.bspstore */                                  \
-       st8 [r17]=r31,16;       /* save predicates */                                           \
-(pKStk)        adds r16=16,r16;        /* skip over ar_bspstore field */                       \
-       ;;                                                                                      \
-       st8 [r16]=r29,16;       /* save b0 */                                                   \
-       st8 [r17]=r18,16;       /* save ar.rsc value for "loadrs" */                            \
-       cmp.eq pNonSys,pSys=r0,r0       /* initialize pSys=0, pNonSys=1 */                      \
-       ;;                                                                                      \
-.mem.offset 0,0; st8.spill [r16]=r20,16;       /* save original r1 */                          \
-.mem.offset 8,0; st8.spill [r17]=r12,16;                                                       \
-       adds r12=-16,r1;        /* switch to kernel memory stack (with 16 bytes of scratch) */  \
-       ;;                                                                                      \
-.mem.offset 0,0; st8.spill [r16]=r13,16;                                                       \
-.mem.offset 8,0; st8.spill [r17]=r21,16;       /* save ar.fpsr */                              \
-       movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;                                      \
-       ld8 r13=[r13];                  /* establish 'current' */                               \
-       ;;                                                                                      \
-.mem.offset 0,0; st8.spill [r16]=r15,16;                                                       \
-.mem.offset 8,0; st8.spill [r17]=r14,16;                                                       \
-       ;;                                                                                      \
-.mem.offset 0,0; st8.spill [r16]=r2,16;                                                        \
-.mem.offset 8,0; st8.spill [r17]=r3,16;                                                        \
-       adds r2=IA64_PT_REGS_R16_OFFSET,r1;                                                     \
-       ;;                                                                                      \
-       EXTRA;                                                                                  \
-       movl r1=__gp;           /* establish kernel global pointer */                           \
-       ;;                                                                                      \
-       MINSTATE_END_SAVE_MIN
-
-/*
- * SAVE_REST saves the remainder of pt_regs (with psr.ic on).
- *
- * Assumed state upon entry:
- *     psr.ic: on
- *     r2:     points to &pt_regs.r16
- *     r3:     points to &pt_regs.r17
- *     r8:     contents of ar.ccv
- *     r9:     contents of ar.csd
- *     r10:    contents of ar.ssd
- *     r11:    FPSR_DEFAULT
- *
- * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST.
- */
-#define SAVE_REST                              \
-.mem.offset 0,0; st8.spill [r2]=r16,16;                \
-.mem.offset 8,0; st8.spill [r3]=r17,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r18,16;                \
-.mem.offset 8,0; st8.spill [r3]=r19,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r20,16;                \
-.mem.offset 8,0; st8.spill [r3]=r21,16;                \
-       mov r18=b6;                             \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r22,16;                \
-.mem.offset 8,0; st8.spill [r3]=r23,16;                \
-       mov r19=b7;                             \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r24,16;                \
-.mem.offset 8,0; st8.spill [r3]=r25,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r26,16;                \
-.mem.offset 8,0; st8.spill [r3]=r27,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r28,16;                \
-.mem.offset 8,0; st8.spill [r3]=r29,16;                \
-       ;;                                      \
-.mem.offset 0,0; st8.spill [r2]=r30,16;                \
-.mem.offset 8,0; st8.spill [r3]=r31,32;                \
-       ;;                                      \
-       mov ar.fpsr=r11;        /* M-unit */    \
-       st8 [r2]=r8,8;          /* ar.ccv */    \
-       adds r24=PT(B6)-PT(F7),r3;              \
-       ;;                                      \
-       stf.spill [r2]=f6,32;                   \
-       stf.spill [r3]=f7,32;                   \
-       ;;                                      \
-       stf.spill [r2]=f8,32;                   \
-       stf.spill [r3]=f9,32;                   \
-       ;;                                      \
-       stf.spill [r2]=f10;                     \
-       stf.spill [r3]=f11;                     \
-       adds r25=PT(B7)-PT(F11),r3;             \
-       ;;                                      \
-       st8 [r24]=r18,16;       /* b6 */        \
-       st8 [r25]=r19,16;       /* b7 */        \
-       ;;                                      \
-       st8 [r24]=r9;           /* ar.csd */    \
-       st8 [r25]=r10;          /* ar.ssd */    \
-       ;;
-
-#define SAVE_MIN_WITH_COVER    DO_SAVE_MIN(cover, mov r30=cr.ifs,)
-#define SAVE_MIN_WITH_COVER_R19        DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19)
-#define SAVE_MIN               DO_SAVE_MIN(     , mov r30=r0, )
diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/pdb-stub.c
--- a/xen/arch/ia64/pdb-stub.c  Tue Aug 30 23:51:51 2005
+++ /dev/null   Wed Aug 31 20:32:27 2005
@@ -1,59 +0,0 @@
-
-/*
- * pervasive debugger
- * www.cl.cam.ac.uk/netos/pdb
- *
- * alex ho
- * 2004
- * university of cambridge computer laboratory
- *
- * code adapted originally from kgdb, nemesis, & gdbserver
- */
-
-#include <xen/lib.h>
-#include <xen/sched.h>
-#include <asm/ptrace.h>
-#include <xen/keyhandler.h> 
-#include <asm/processor.h>
-#include <asm/pdb.h>
-#include <xen/list.h>
-#include <xen/serial.h>
-
-#define __PDB_GET_VAL 1
-#define __PDB_SET_VAL 2
-
-/*
- * Read or write memory in an address space
- */
-int pdb_change_values(u_char *buffer, int length,
-                     unsigned long cr3, unsigned long addr, int rw)
-{
-       dummy();
-       return 0;
-}
-
-/*
- * Set memory in a domain's address space
- * Set "length" bytes at "address" from "domain" to the values in "buffer".
- * Return the number of bytes set, 0 if there was a problem.
- */
-
-int pdb_set_values(u_char *buffer, int length,
-                  unsigned long cr3, unsigned long addr)
-{
-    int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL);
-    return count;
-}
-
-/*
- * Read memory from a domain's address space.
- * Fetch "length" bytes at "address" from "domain" into "buffer".
- * Return the number of bytes read, 0 if there was a problem.
- */
-
-int pdb_get_values(u_char *buffer, int length,
-                  unsigned long cr3, unsigned long addr)
-{
-  return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL);
-}
-
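
A hypothetical caller, to illustrate how these stubs are meant to be used once pdb_change_values() does real work (the buffer size and message below are made up for illustration):

void pdb_dump_example(unsigned long cr3, unsigned long addr)
{
        u_char buf[16];
        int n = pdb_get_values(buf, sizeof(buf), cr3, addr);

        if (n == 0)
                printk("pdb: unable to read %lx\n", addr);
}
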

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

