[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Begin updating to 2.6.13 base
# HG changeset patch # User djm@xxxxxxxxxxxxxxx # Node ID b7276814008c9c924fceecf6fd9f67ccddaadcb2 # Parent 44316ce8327754a7a70c80ffff551e7c4619e066 Begin updating to 2.6.13 base diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/Makefile --- a/xen/arch/ia64/Makefile Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/Makefile Wed Aug 31 20:32:27 2005 @@ -1,18 +1,21 @@ include $(BASEDIR)/Rules.mk -VPATH = linux linux-xen +VPATH = linux linux-xen linux/lib +#VPATH = linux-xen linux/lib # libs-y += arch/ia64/lib/lib.a OBJS = xensetup.o setup.o time.o irq.o ia64_ksyms.o process.o smp.o \ - xenmisc.o pdb-stub.o acpi.o hypercall.o \ + xenmisc.o acpi.o hypercall.o \ machvec.o dom0_ops.o domain.o hpsimserial.o pcdp.o \ idle0_task.o pal.o hpsim.o efi.o efi_stub.o ivt.o mm_contig.o \ xenmem.o sal.o cmdline.o mm_init.o tlb.o smpboot.o \ - extable.o linuxextable.o xenirq.o xentime.o \ + extable.o linuxextable.o sort.o xenirq.o xentime.o \ regionreg.o entry.o unaligned.o privop.o vcpu.o \ irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \ grant_table.o sn_console.o + +#OBJS += idiv64.o idiv32.o \ # TMP holder to contain *.0 moved out of CONFIG_VTI OBJS += vmx_init.o @@ -22,6 +25,13 @@ vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \ vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o endif + +# files from xen/arch/ia64/linux/lib (linux/arch/ia64/lib) +OBJS += bitop.o clear_page.o flush.o copy_page_mck.o \ + memset.o strlen.o memcpy_mck.o \ + __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ + __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o + # perfmon.o # unwind.o needed for kernel unwinding (rare) @@ -30,8 +40,8 @@ # remove following line if not privifying in memory # OBJS += privify.o -default: $(OBJS) head.o ia64lib.o xen.lds.s - $(LD) -r -o arch.o $(OBJS) ia64lib.o +default: $(OBJS) head.o xen.lds.s + $(LD) -r -o arch.o $(OBJS) $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \ -Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET) @@ -79,12 +89,29 @@ $(CC) -E $(CPPFLAGS) -P -DXEN -D__ASSEMBLY__ \ -o xen.lds.s xen.lds.S -ia64lib.o: - $(MAKE) -C linux/lib && cp linux/lib/ia64lib.o . +# variants of divide/modulo +# see files in xen/arch/ia64/linux/lib (linux/arch/ia64/lib) +__divdi3.o: idiv64.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< +__udivdi3.o: idiv64.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< +__moddi3.o: idiv64.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< +__umoddi3.o: idiv64.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< +__divsi3.o: idiv32.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< +__udivsi3.o: idiv32.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< +__modsi3.o: idiv32.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< +__umodsi3.o: idiv32.S + $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< + clean: rm -f *.o *~ core xen.lds.s $(BASEDIR)/include/asm-ia64/.offsets.h.stamp asm-offsets.s rm -f asm-xsi-offsets.s $(BASEDIR)/include/asm-ia64/asm-xsi-offsets.h - rm -f lib/*.o + rm -f linux/lib/*.o .PHONY: default clean diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/setup.c --- a/xen/arch/ia64/linux-xen/setup.c Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux-xen/setup.c Wed Aug 31 20:32:27 2005 @@ -4,10 +4,15 @@ * Copyright (C) 1998-2001, 2003-2004 Hewlett-Packard Co * David Mosberger-Tang <davidm@xxxxxxxxxx> * Stephane Eranian <eranian@xxxxxxxxxx> - * Copyright (C) 2000, Rohit Seth <rohit.seth@xxxxxxxxx> + * Copyright (C) 2000, 2004 Intel Corp + * Rohit Seth <rohit.seth@xxxxxxxxx> + * Suresh Siddha <suresh.b.siddha@xxxxxxxxx> + * Gordon Jin <gordon.jin@xxxxxxxxx> * Copyright (C) 1999 VA Linux Systems * Copyright (C) 1999 Walt Drummond <drummond@xxxxxxxxxxx> * + * 12/26/04 S.Siddha, G.Jin, R.Seth + * Add multi-threading and multi-core detection * 11/12/01 D.Mosberger Convert get_cpuinfo() to seq_file based show_cpuinfo(). * 04/04/00 D.Mosberger renamed cpu_initialized to cpu_online_map * 03/31/00 R.Seth cpu_initialized and current->processor fixes @@ -15,6 +20,7 @@ * 02/01/00 R.Seth fixed get_cpuinfo for SMP * 01/07/99 S.Eranian added the support for command line argument * 06/24/99 W.Drummond added boot_cpu_data. + * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()" */ #include <linux/config.h> #include <linux/module.h> @@ -35,6 +41,10 @@ #include <linux/serial_core.h> #include <linux/efi.h> #include <linux/initrd.h> +#ifndef XEN +#include <linux/platform.h> +#include <linux/pm.h> +#endif #include <asm/ia32.h> #include <asm/machvec.h> @@ -51,8 +61,10 @@ #include <asm/smp.h> #include <asm/system.h> #include <asm/unistd.h> +#ifdef XEN #include <asm/vmx.h> #include <asm/io.h> +#endif #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) # error "struct cpuinfo_ia64 too big!" @@ -64,12 +76,16 @@ #endif DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); +#ifdef XEN DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr); +#endif DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); unsigned long ia64_cycles_per_usec; struct ia64_boot_param *ia64_boot_param; struct screen_info screen_info; +unsigned long vga_console_iobase; +unsigned long vga_console_membase; unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ @@ -78,7 +94,12 @@ EXPORT_SYMBOL(io_space); unsigned int num_io_spaces; -unsigned char aux_device_present = 0xaa; /* XXX remove this when legacy I/O is gone */ +/* + * "flush_icache_range()" needs to know what processor dependent stride size to use + * when it makes i-cache(s) coherent with d-caches. + */ +#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */ +unsigned long ia64_i_cache_stride_shift = ~0; /* * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This @@ -287,23 +308,25 @@ static inline int __init early_console_setup (char *cmdline) { + int earlycons = 0; + #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE { extern int sn_serial_console_early_setup(void); if (!sn_serial_console_early_setup()) - return 0; + earlycons++; } #endif #ifdef CONFIG_EFI_PCDP if (!efi_setup_pcdp_console(cmdline)) - return 0; + earlycons++; #endif #ifdef CONFIG_SERIAL_8250_CONSOLE if (!early_serial_console_init(cmdline)) - return 0; -#endif - - return -1; + earlycons++; +#endif + + return (earlycons) ? 0 : -1; } static inline void @@ -315,7 +338,34 @@ #endif } -void __init +#ifdef CONFIG_SMP +static void +check_for_logical_procs (void) +{ + pal_logical_to_physical_t info; + s64 status; + + status = ia64_pal_logical_to_phys(0, &info); + if (status == -1) { + printk(KERN_INFO "No logical to physical processor mapping " + "available\n"); + return; + } + if (status) { + printk(KERN_ERR "ia64_pal_logical_to_phys failed with %ld\n", + status); + return; + } + /* + * Total number of siblings that BSP has. Though not all of them + * may have booted successfully. The correct number of siblings + * booted is in info.overview_num_log. + */ + smp_num_siblings = info.overview_tpc; + smp_num_cpucores = info.overview_cpp; +} +#endif + #ifdef XEN early_setup_arch (char **cmdline_p) #else @@ -398,6 +448,19 @@ #ifdef CONFIG_SMP cpu_physical_id(0) = hard_smp_processor_id(); + + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); + + check_for_logical_procs(); + if (smp_num_cpucores > 1) + printk(KERN_INFO + "cpu package is Multi-Core capable: number of cores=%d\n", + smp_num_cpucores); + if (smp_num_siblings > 1) + printk(KERN_INFO + "cpu package is Multi-Threading capable: number of siblings=%d\n", + smp_num_siblings); #endif #ifdef XEN @@ -505,12 +568,23 @@ "cpu regs : %u\n" "cpu MHz : %lu.%06lu\n" "itc MHz : %lu.%06lu\n" - "BogoMIPS : %lu.%02lu\n\n", + "BogoMIPS : %lu.%02lu\n", cpunum, c->vendor, family, c->model, c->revision, c->archrev, features, c->ppn, c->number, c->proc_freq / 1000000, c->proc_freq % 1000000, c->itc_freq / 1000000, c->itc_freq % 1000000, lpj*HZ/500000, (lpj*HZ/5000) % 100); +#ifdef CONFIG_SMP + seq_printf(m, "siblings : %u\n", c->num_log); + if (c->threads_per_core > 1 || c->cores_per_socket > 1) + seq_printf(m, + "physical id: %u\n" + "core id : %u\n" + "thread id : %u\n", + c->socket_id, c->core_id, c->thread_id); +#endif + seq_printf(m,"\n"); + return 0; } @@ -581,6 +655,14 @@ memcpy(c->vendor, cpuid.field.vendor, 16); #ifdef CONFIG_SMP c->cpu = smp_processor_id(); + + /* below default values will be overwritten by identify_siblings() + * for Multi-Threading/Multi-Core capable cpu's + */ + c->threads_per_core = c->cores_per_socket = c->num_log = 1; + c->socket_id = -1; + + identify_siblings(c); #endif c->ppn = cpuid.field.ppn; c->number = cpuid.field.number; @@ -611,6 +693,12 @@ /* start_kernel() requires this... */ } +/* + * Calculate the max. cache line size. + * + * In addition, the minimum of the i-cache stride sizes is calculated for + * "flush_icache_range()". + */ static void get_max_cacheline_size (void) { @@ -624,6 +712,8 @@ printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", __FUNCTION__, status); max = SMP_CACHE_BYTES; + /* Safest setup for "flush_icache_range()" */ + ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT; goto out; } @@ -632,14 +722,31 @@ &cci); if (status != 0) { printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", + "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", __FUNCTION__, l, status); max = SMP_CACHE_BYTES; + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + cci.pcci_unified = 1; } line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; - } + if (!cci.pcci_unified) { + status = ia64_pal_cache_config_info(l, + /* cache_type (instruction)= */ 1, + &cci); + if (status != 0) { + printk(KERN_ERR + "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", + __FUNCTION__, l, status); + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + } + } + if (cci.pcci_stride < ia64_i_cache_stride_shift) + ia64_i_cache_stride_shift = cci.pcci_stride; + } out: if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; @@ -700,7 +807,17 @@ ia64_set_kr(IA64_KR_FPU_OWNER, 0); /* - * Initialize default control register to defer all speculative faults. The + * Initialize the page-table base register to a global + * directory with all zeroes. This ensure that we can handle + * TLB-misses to user address-space even before we created the + * first user address-space. This may happen, e.g., due to + * aggressive use of lfetch.fault. + */ + ia64_set_kr(IA64_KR_PT_BASE, __pa(ia64_imva(empty_zero_page))); + + /* + * Initialize default control register to defer speculative faults except + * for those arising from TLB misses, which are not deferred. The * kernel MUST NOT depend on a particular setting of these bits (in other words, * the kernel must have recovery code for all speculative accesses). Turn on * dcr.lc as per recommendation by the architecture team. Most IA-32 apps @@ -762,6 +879,9 @@ /* size of physical stacked register partition plus 8 bytes: */ __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; platform_cpu_init(); +#ifndef XEN + pm_idle = default_idle; +#endif } void diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/extable.c --- a/xen/arch/ia64/linux/extable.c Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/extable.c Wed Aug 31 20:32:27 2005 @@ -6,29 +6,29 @@ */ #include <linux/config.h> +#include <linux/sort.h> #include <asm/uaccess.h> #include <asm/module.h> -static inline int -compare_entries (struct exception_table_entry *l, struct exception_table_entry *r) +static int cmp_ex(const void *a, const void *b) { + const struct exception_table_entry *l = a, *r = b; u64 lip = (u64) &l->addr + l->addr; u64 rip = (u64) &r->addr + r->addr; + /* avoid overflow */ + if (lip > rip) + return 1; if (lip < rip) return -1; - if (lip == rip) - return 0; - else - return 1; + return 0; } -static inline void -swap_entries (struct exception_table_entry *l, struct exception_table_entry *r) +static void swap_ex(void *a, void *b, int size) { + struct exception_table_entry *l = a, *r = b, tmp; u64 delta = (u64) r - (u64) l; - struct exception_table_entry tmp; tmp = *l; l->addr = r->addr + delta; @@ -38,23 +38,20 @@ } /* - * Sort the exception table. It's usually already sorted, but there may be unordered - * entries due to multiple text sections (such as the .init text section). Note that the - * exception-table-entries contain location-relative addresses, which requires a bit of - * care during sorting to avoid overflows in the offset members (e.g., it would not be - * safe to make a temporary copy of an exception-table entry on the stack, because the - * stack may be more than 2GB away from the exception-table). + * Sort the exception table. It's usually already sorted, but there + * may be unordered entries due to multiple text sections (such as the + * .init text section). Note that the exception-table-entries contain + * location-relative addresses, which requires a bit of care during + * sorting to avoid overflows in the offset members (e.g., it would + * not be safe to make a temporary copy of an exception-table entry on + * the stack, because the stack may be more than 2GB away from the + * exception-table). */ -void -sort_extable (struct exception_table_entry *start, struct exception_table_entry *finish) +void sort_extable (struct exception_table_entry *start, + struct exception_table_entry *finish) { - struct exception_table_entry *p, *q; - - /* insertion sort */ - for (p = start + 1; p < finish; ++p) - /* start .. p-1 is sorted; push p down to it's proper place */ - for (q = p; q > start && compare_entries(&q[0], &q[-1]) < 0; --q) - swap_entries(&q[0], &q[-1]); + sort(start, finish - start, sizeof(struct exception_table_entry), + cmp_ex, swap_ex); } const struct exception_table_entry * diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/ia64_ksyms.c --- a/xen/arch/ia64/linux/ia64_ksyms.c Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/ia64_ksyms.c Wed Aug 31 20:32:27 2005 @@ -57,9 +57,6 @@ EXPORT_SYMBOL(__strlen_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); - -#include <asm/unistd.h> -EXPORT_SYMBOL(__ia64_syscall); /* from arch/ia64/lib */ extern void __divsi3(void); diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/flush.S --- a/xen/arch/ia64/linux/lib/flush.S Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/lib/flush.S Wed Aug 31 20:32:27 2005 @@ -1,39 +1,61 @@ /* * Cache flushing routines. * - * Copyright (C) 1999-2001 Hewlett-Packard Co - * Copyright (C) 1999-2001 David Mosberger-Tang <davidm@xxxxxxxxxx> + * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + * + * 05/28/05 Zoltan Menyhart Dynamic stride size */ + #include <asm/asmmacro.h> -#include <asm/page.h> + /* * flush_icache_range(start,end) - * Must flush range from start to end-1 but nothing else (need to + * + * Make i-cache(s) coherent with d-caches. + * + * Must deal with range from start to end-1 but nothing else (need to * be careful not to touch addresses that may be unmapped). + * + * Note: "in0" and "in1" are preserved for debugging purposes. */ GLOBAL_ENTRY(flush_icache_range) + .prologue - alloc r2=ar.pfs,2,0,0,0 - sub r8=in1,in0,1 + alloc r2=ar.pfs,2,0,0,0 + movl r3=ia64_i_cache_stride_shift + mov r21=1 ;; - shr.u r8=r8,5 // we flush 32 bytes per iteration - .save ar.lc, r3 - mov r3=ar.lc // save ar.lc + ld8 r20=[r3] // r20: stride shift + sub r22=in1,r0,1 // last byte address + ;; + shr.u r23=in0,r20 // start / (stride size) + shr.u r22=r22,r20 // (last byte address) / (stride size) + shl r21=r21,r20 // r21: stride size of the i-cache(s) + ;; + sub r8=r22,r23 // number of strides - 1 + shl r24=r23,r20 // r24: addresses for "fc.i" = + // "start" rounded down to stride boundary + .save ar.lc,r3 + mov r3=ar.lc // save ar.lc ;; .body - - mov ar.lc=r8 + mov ar.lc=r8 ;; -.Loop: fc in0 // issuable on M0 only - add in0=32,in0 + /* + * 32 byte aligned loop, even number of (actually 2) bundles + */ +.Loop: fc.i r24 // issuable on M0 only + add r24=r21,r24 // we flush "stride size" bytes per iteration + nop.i 0 br.cloop.sptk.few .Loop ;; sync.i ;; srlz.i ;; - mov ar.lc=r3 // restore ar.lc + mov ar.lc=r3 // restore ar.lc br.ret.sptk.many rp END(flush_icache_range) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy_mck.S --- a/xen/arch/ia64/linux/lib/memcpy_mck.S Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/lib/memcpy_mck.S Wed Aug 31 20:32:27 2005 @@ -75,6 +75,7 @@ mov f6=f0 br.cond.sptk .common_code ;; +END(memcpy) GLOBAL_ENTRY(__copy_user) .prologue // check dest alignment @@ -300,7 +301,7 @@ add src_pre_mem=0,src0 // prefetch src pointer add dst_pre_mem=0,dst0 // prefetch dest pointer and src0=-8,src0 // 1st src pointer -(p7) mov ar.lc = r21 +(p7) mov ar.lc = cnt (p8) mov ar.lc = r0 ;; TEXT_ALIGN(32) @@ -524,7 +525,6 @@ #undef B #undef C #undef D -END(memcpy) /* * Due to lack of local tag support in gcc 2.x assembler, it is not clear which diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memset.S --- a/xen/arch/ia64/linux/lib/memset.S Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/lib/memset.S Wed Aug 31 20:32:27 2005 @@ -57,10 +57,10 @@ { .mmi .prologue alloc tmp = ar.pfs, 3, 0, 0, 0 - .body lfetch.nt1 [dest] // .save ar.lc, save_lc mov.i save_lc = ar.lc + .body } { .mmi mov ret0 = dest // return value cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/pcdp.h --- a/xen/arch/ia64/linux/pcdp.h Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/pcdp.h Wed Aug 31 20:32:27 2005 @@ -2,7 +2,7 @@ * Definitions for PCDP-defined console devices * * v1.0a: http://www.dig64.org/specifications/DIG64_HCDPv10a_01.pdf - * v2.0: http://www.dig64.org/specifications/DIG64_HCDPv20_042804.pdf + * v2.0: http://www.dig64.org/specifications/DIG64_PCDPv20.pdf * * (c) Copyright 2002, 2004 Hewlett-Packard Development Company, L.P. * Khalid Aziz <khalid.aziz@xxxxxx> @@ -52,11 +52,36 @@ u32 clock_rate; u8 pci_prog_intfc; u8 flags; -}; + u16 conout_index; + u32 reserved; +} __attribute__((packed)); + +#define PCDP_IF_PCI 1 + +/* pcdp_if_pci.trans */ +#define PCDP_PCI_TRANS_IOPORT 0x02 +#define PCDP_PCI_TRANS_MMIO 0x01 + +struct pcdp_if_pci { + u8 interconnect; + u8 reserved; + u16 length; + u8 segment; + u8 bus; + u8 dev; + u8 fun; + u16 dev_id; + u16 vendor_id; + u32 acpi_interrupt; + u64 mmio_tra; + u64 ioport_tra; + u8 flags; + u8 trans; +} __attribute__((packed)); struct pcdp_vga { u8 count; /* address space descriptors */ -}; +} __attribute__((packed)); /* pcdp_device.flags */ #define PCDP_PRIMARY_CONSOLE 1 @@ -66,7 +91,9 @@ u8 flags; u16 length; u16 efi_index; -}; + /* next data is pcdp_if_pci or pcdp_if_acpi (not yet supported) */ + /* next data is device specific type (currently only pcdp_vga) */ +} __attribute__((packed)); struct pcdp { u8 signature[4]; @@ -81,4 +108,4 @@ u32 num_uarts; struct pcdp_uart uart[0]; /* actual size is num_uarts */ /* remainder of table is pcdp_device structures */ -}; +} __attribute__((packed)); diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/minstate.h --- /dev/null Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux-xen/minstate.h Wed Aug 31 20:32:27 2005 @@ -0,0 +1,254 @@ +#include <linux/config.h> + +#include <asm/cache.h> + +#include "entry.h" + +/* + * For ivt.s we want to access the stack virtually so we don't have to disable translation + * on interrupts. + * + * On entry: + * r1: pointer to current task (ar.k6) + */ +#define MINSTATE_START_SAVE_MIN_VIRT \ +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ + ;; \ +(pUStk) mov.m r24=ar.rnat; \ +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ +(pKStk) mov r1=sp; /* get sp */ \ + ;; \ +(pUStk) lfetch.fault.excl.nt1 [r22]; \ +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ + ;; \ +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ + ;; \ +(pUStk) mov r18=ar.bsp; \ +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ + +#define MINSTATE_END_SAVE_MIN_VIRT \ + bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ + ;; + +/* + * For mca_asm.S we want to access the stack physically since the state is saved before we + * go virtual and don't want to destroy the iip or ipsr. + */ +#define MINSTATE_START_SAVE_MIN_PHYS \ +(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ +(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ +(pKStk) ld8 r3 = [r3];; \ +(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ +(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ +(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ +(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ + ;; \ +(pUStk) mov r24=ar.rnat; \ +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ +(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ +(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ + ;; \ +(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ +(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ + ;; \ +(pUStk) mov r18=ar.bsp; \ +(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ + +#define MINSTATE_END_SAVE_MIN_PHYS \ + dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ + ;; + +#ifdef MINSTATE_VIRT +# define MINSTATE_GET_CURRENT(reg) \ + movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\ + ld8 reg=[reg] +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT +#endif + +#ifdef MINSTATE_PHYS +# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg +# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS +# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS +#endif + +/* + * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves + * the minimum state necessary that allows us to turn psr.ic back + * on. + * + * Assumed state upon entry: + * psr.ic: off + * r31: contains saved predicates (pr) + * + * Upon exit, the state is as follows: + * psr.ic: off + * r2 = points to &pt_regs.r16 + * r8 = contents of ar.ccv + * r9 = contents of ar.csd + * r10 = contents of ar.ssd + * r11 = FPSR_DEFAULT + * r12 = kernel sp (kernel virtual address) + * r13 = points to current task_struct (kernel virtual address) + * p15 = TRUE if psr.i is set in cr.ipsr + * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: + * preserved + * + * Note that psr.ic is NOT turned on by this macro. This is so that + * we can pass interruption state as arguments to a handler. + */ +#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ + MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ + mov r27=ar.rsc; /* M */ \ + mov r20=r1; /* A */ \ + mov r25=ar.unat; /* M */ \ + mov r29=cr.ipsr; /* M */ \ + mov r26=ar.pfs; /* I */ \ + mov r28=cr.iip; /* M */ \ + mov r21=ar.fpsr; /* M */ \ + COVER; /* B;; (or nothing) */ \ + ;; \ + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ + ;; \ + ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ + st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ + adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ + /* switch from user to kernel RBS: */ \ + ;; \ + invala; /* M */ \ + SAVE_IFS; \ + cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ + ;; \ + MINSTATE_START_SAVE_MIN \ + adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ + adds r16=PT(CR_IPSR),r1; \ + ;; \ + lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ + st8 [r16]=r29; /* save cr.ipsr */ \ + ;; \ + lfetch.fault.excl.nt1 [r17]; \ + tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ + mov r29=b0 \ + ;; \ + adds r16=PT(R8),r1; /* initialize first base pointer */ \ + adds r17=PT(R9),r1; /* initialize second base pointer */ \ +(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r8,16; \ +.mem.offset 8,0; st8.spill [r17]=r9,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r10,24; \ +.mem.offset 8,0; st8.spill [r17]=r11,24; \ + ;; \ + st8 [r16]=r28,16; /* save cr.iip */ \ + st8 [r17]=r30,16; /* save cr.ifs */ \ +(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ + mov r8=ar.ccv; \ + mov r9=ar.csd; \ + mov r10=ar.ssd; \ + movl r11=FPSR_DEFAULT; /* L-unit */ \ + ;; \ + st8 [r16]=r25,16; /* save ar.unat */ \ + st8 [r17]=r26,16; /* save ar.pfs */ \ + shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ + ;; \ + st8 [r16]=r27,16; /* save ar.rsc */ \ +(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ +(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ + ;; /* avoid RAW on r16 & r17 */ \ +(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ + st8 [r17]=r31,16; /* save predicates */ \ +(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ + ;; \ + st8 [r16]=r29,16; /* save b0 */ \ + st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ + cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \ +.mem.offset 8,0; st8.spill [r17]=r12,16; \ + adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r13,16; \ +.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ + movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; \ + ld8 r13=[r13]; /* establish 'current' */ \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r15,16; \ +.mem.offset 8,0; st8.spill [r17]=r14,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r16]=r2,16; \ +.mem.offset 8,0; st8.spill [r17]=r3,16; \ + adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ + ;; \ + EXTRA; \ + movl r1=__gp; /* establish kernel global pointer */ \ + ;; \ + MINSTATE_END_SAVE_MIN + +/* + * SAVE_REST saves the remainder of pt_regs (with psr.ic on). + * + * Assumed state upon entry: + * psr.ic: on + * r2: points to &pt_regs.r16 + * r3: points to &pt_regs.r17 + * r8: contents of ar.ccv + * r9: contents of ar.csd + * r10: contents of ar.ssd + * r11: FPSR_DEFAULT + * + * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. + */ +#define SAVE_REST \ +.mem.offset 0,0; st8.spill [r2]=r16,16; \ +.mem.offset 8,0; st8.spill [r3]=r17,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r18,16; \ +.mem.offset 8,0; st8.spill [r3]=r19,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r20,16; \ +.mem.offset 8,0; st8.spill [r3]=r21,16; \ + mov r18=b6; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r22,16; \ +.mem.offset 8,0; st8.spill [r3]=r23,16; \ + mov r19=b7; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r24,16; \ +.mem.offset 8,0; st8.spill [r3]=r25,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r26,16; \ +.mem.offset 8,0; st8.spill [r3]=r27,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r28,16; \ +.mem.offset 8,0; st8.spill [r3]=r29,16; \ + ;; \ +.mem.offset 0,0; st8.spill [r2]=r30,16; \ +.mem.offset 8,0; st8.spill [r3]=r31,32; \ + ;; \ + mov ar.fpsr=r11; /* M-unit */ \ + st8 [r2]=r8,8; /* ar.ccv */ \ + adds r24=PT(B6)-PT(F7),r3; \ + ;; \ + stf.spill [r2]=f6,32; \ + stf.spill [r3]=f7,32; \ + ;; \ + stf.spill [r2]=f8,32; \ + stf.spill [r3]=f9,32; \ + ;; \ + stf.spill [r2]=f10; \ + stf.spill [r3]=f11; \ + adds r25=PT(B7)-PT(F11),r3; \ + ;; \ + st8 [r24]=r18,16; /* b6 */ \ + st8 [r25]=r19,16; /* b7 */ \ + ;; \ + st8 [r24]=r9; /* ar.csd */ \ + st8 [r25]=r10; /* ar.ssd */ \ + ;; + +#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) +#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) +#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, ) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux-xen/sort.c --- /dev/null Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux-xen/sort.c Wed Aug 31 20:32:27 2005 @@ -0,0 +1,122 @@ +/* + * A fast, small, non-recursive O(nlog n) sort for the Linux kernel + * + * Jan 23 2005 Matt Mackall <mpm@xxxxxxxxxxx> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#ifdef XEN +#include <linux/types.h> +#endif + +void u32_swap(void *a, void *b, int size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +void generic_swap(void *a, void *b, int size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +/* + * sort - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp: pointer to comparison function + * @swap: pointer to swap function or NULL + * + * This function does a heapsort on the given array. You may provide a + * swap function optimized to your element type. + * + * Sorting time is O(n log n) both on average and worst-case. While + * qsort is about 20% faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. + */ + +void sort(void *base, size_t num, size_t size, + int (*cmp)(const void *, const void *), + void (*swap)(void *, void *, int size)) +{ + /* pre-scale counters for performance */ + int i = (num/2) * size, n = num * size, c, r; + + if (!swap) + swap = (size == 4 ? u32_swap : generic_swap); + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 < n; r = c) { + c = r * 2; + if (c < n - size && cmp(base + c, base + c + size) < 0) + c += size; + if (cmp(base + r, base + c) >= 0) + break; + swap(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i >= 0; i -= size) { + swap(base, base + i, size); + for (r = 0; r * 2 < i; r = c) { + c = r * 2; + if (c < i - size && cmp(base + c, base + c + size) < 0) + c += size; + if (cmp(base + r, base + c) >= 0) + break; + swap(base + r, base + c, size); + } + } +} + +EXPORT_SYMBOL(sort); + +#if 0 +/* a simple boot-time regression test */ + +int cmpint(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int sort_test(void) +{ + int *a, i, r = 1; + + a = kmalloc(1000 * sizeof(int), GFP_KERNEL); + BUG_ON(!a); + + printk("testing sort()\n"); + + for (i = 0; i < 1000; i++) { + r = (r * 725861) % 6599; + a[i] = r; + } + + sort(a, 1000, sizeof(int), cmpint, NULL); + + for (i = 0; i < 999; i++) + if (a[i] > a[i+1]) { + printk("sort() failed!\n"); + break; + } + + kfree(a); + + return 0; +} + +module_init(sort_test); +#endif diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/README.origin --- /dev/null Tue Aug 30 23:51:51 2005 +++ b/xen/arch/ia64/linux/README.origin Wed Aug 31 20:32:27 2005 @@ -0,0 +1,20 @@ +Source files in this directory are identical copies of linux-2.6.13 files: + +cmdline.c -> linux/lib/cmdline.c +efi_stub.S -> linux/arch/ia64/efi_stub.S +extable.c -> linux/arch/ia64/mm/extable.c +hpsim.S -> linux/arch/ia64/hp/sim/hpsim.S +ia64_ksyms.c -> linux/arch/ia64/kernel/ia64_ksyms.c +linuxextable.c -> linux/kernel/extable.c +machvec.c -> linux/arch/ia64/kernel/machvec.c +patch.c -> linux/arch/ia64/kernel/patch.c +pcdp.h -> drivers/firmware/pcdp.h +lib/bitop.c -> linux/arch/ia64/lib/bitop.c +lib/clear_page.S -> linux/arch/ia64/lib/clear_page.S +lib/copy_page_mck.S -> linux/arch/ia64/lib/copy_page_mck.S +lib/flush.S -> linux/arch/ia64/lib/flush.S +lib/idiv32.S -> linux/arch/ia64/lib/idiv32.S +lib/idiv64.S -> linux/arch/ia64/lib/idiv64.S +lib/memcpy_mck.S -> linux/arch/ia64/lib/memcpy_mck.S +lib/memset.S -> linux/arch/ia64/lib/memset.S +lib/strlen.S -> linux/arch/ia64/lib/strlen.S diff -r 44316ce83277 -r b7276814008c xen/include/asm-ia64/linux/sort.h --- /dev/null Tue Aug 30 23:51:51 2005 +++ b/xen/include/asm-ia64/linux/sort.h Wed Aug 31 20:32:27 2005 @@ -0,0 +1,10 @@ +#ifndef _LINUX_SORT_H +#define _LINUX_SORT_H + +#include <linux/types.h> + +void sort(void *base, size_t num, size_t size, + int (*cmp)(const void *, const void *), + void (*swap)(void *, void *, int)); + +#endif diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/lib/Makefile --- a/xen/arch/ia64/lib/Makefile Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,44 +0,0 @@ -# -# Makefile for ia64-specific library routines.. -# - -include $(BASEDIR)/Rules.mk - -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ - __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ - bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ - clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ - flush.o ip_fast_csum.o do_csum.o copy_user.o \ - memset.o strlen.o memcpy.o - -default: $(OBJS) - $(LD) -r -o ia64lib.o $(OBJS) - -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__ - -__divdi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< - -__udivdi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< - -__moddi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< - -__umoddi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< - -__divsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< - -__udivsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< - -__modsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< - -__umodsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< - -clean: - rm -f *.o *~ diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/Makefile --- a/xen/arch/ia64/linux/lib/Makefile Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,44 +0,0 @@ -# -# Makefile for ia64-specific library routines.. -# - -include $(BASEDIR)/Rules.mk - -OBJS := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ - __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \ - bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o \ - clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \ - flush.o ip_fast_csum.o do_csum.o copy_user.o \ - memset.o strlen.o memcpy.o - -default: $(OBJS) - $(LD) -r -o ia64lib.o $(OBJS) - -AFLAGS += -I$(BASEDIR)/include -D__ASSEMBLY__ - -__divdi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< - -__udivdi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< - -__moddi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< - -__umoddi3.o: idiv64.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< - -__divsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -o $@ $< - -__udivsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DUNSIGNED -c -o $@ $< - -__modsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -c -o $@ $< - -__umodsi3.o: idiv32.S - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -c -DMODULO -DUNSIGNED -c -o $@ $< - -clean: - rm -f *.o *~ diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/carta_random.S --- a/xen/arch/ia64/linux/lib/carta_random.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,54 +0,0 @@ -/* - * Fast, simple, yet decent quality random number generator based on - * a paper by David G. Carta ("Two Fast Implementations of the - * `Minimal Standard' Random Number Generator," Communications of the - * ACM, January, 1990). - * - * Copyright (C) 2002 Hewlett-Packard Co - * David Mosberger-Tang <davidm@xxxxxxxxxx> - */ - -#include <asm/asmmacro.h> - -#define a r2 -#define m r3 -#define lo r8 -#define hi r9 -#define t0 r16 -#define t1 r17 -#define seed r32 - -GLOBAL_ENTRY(carta_random32) - movl a = (16807 << 16) | 16807 - ;; - pmpyshr2.u t0 = a, seed, 0 - pmpyshr2.u t1 = a, seed, 16 - ;; - unpack2.l t0 = t1, t0 - dep m = -1, r0, 0, 31 - ;; - zxt4 lo = t0 - shr.u hi = t0, 32 - ;; - dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff) - ;; - shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16 - shr t1 = hi, 15 // t1 = (hi >> 15) - ;; - add lo = lo, t0 - ;; - cmp.gtu p6, p0 = lo, m - ;; -(p6) and lo = lo, m - ;; -(p6) add lo = 1, lo - ;; - add lo = lo, t1 - ;; - cmp.gtu p6, p0 = lo, m - ;; -(p6) and lo = lo, m - ;; -(p6) add lo = 1, lo - br.ret.sptk.many rp -END(carta_random32) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/checksum.c --- a/xen/arch/ia64/linux/lib/checksum.c Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,102 +0,0 @@ -/* - * Network checksum routines - * - * Copyright (C) 1999, 2003 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * - * Most of the code coming from arch/alpha/lib/checksum.c - * - * This file contains network checksum routines that are better done - * in an architecture-specific manner due to speed.. - */ - -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/byteorder.h> - -static inline unsigned short -from64to16 (unsigned long x) -{ - /* add up 32-bit words for 33 bits */ - x = (x & 0xffffffff) + (x >> 32); - /* add up 16-bit and 17-bit words for 17+c bits */ - x = (x & 0xffff) + (x >> 16); - /* add up 16-bit and 2-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* - * computes the checksum of the TCP/UDP pseudo-header - * returns a 16-bit checksum, already complemented. - */ -unsigned short int -csum_tcpudp_magic (unsigned long saddr, unsigned long daddr, unsigned short len, - unsigned short proto, unsigned int sum) -{ - return ~from64to16(saddr + daddr + sum + ((unsigned long) ntohs(len) << 16) + - ((unsigned long) proto << 8)); -} - -EXPORT_SYMBOL(csum_tcpudp_magic); - -unsigned int -csum_tcpudp_nofold (unsigned long saddr, unsigned long daddr, unsigned short len, - unsigned short proto, unsigned int sum) -{ - unsigned long result; - - result = (saddr + daddr + sum + - ((unsigned long) ntohs(len) << 16) + - ((unsigned long) proto << 8)); - - /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */ - /* 64 to 33 */ - result = (result & 0xffffffff) + (result >> 32); - /* 33 to 32 */ - result = (result & 0xffffffff) + (result >> 32); - return result; -} - -extern unsigned long do_csum (const unsigned char *, long); - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 32-bit boundary - */ -unsigned int -csum_partial (const unsigned char * buff, int len, unsigned int sum) -{ - unsigned long result = do_csum(buff, len); - - /* add in old sum, and carry.. */ - result += sum; - /* 32+c bits -> 32 bits */ - result = (result & 0xffffffff) + (result >> 32); - return result; -} - -EXPORT_SYMBOL(csum_partial); - -/* - * this routine is used for miscellaneous IP-like checksums, mainly - * in icmp.c - */ -unsigned short -ip_compute_csum (unsigned char * buff, int len) -{ - return ~do_csum(buff,len); -} - -EXPORT_SYMBOL(ip_compute_csum); diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/clear_user.S --- a/xen/arch/ia64/linux/lib/clear_user.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,209 +0,0 @@ -/* - * This routine clears to zero a linear memory buffer in user space. - * - * Inputs: - * in0: address of buffer - * in1: length of buffer in bytes - * Outputs: - * r8: number of bytes that didn't get cleared due to a fault - * - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - */ - -#include <asm/asmmacro.h> - -// -// arguments -// -#define buf r32 -#define len r33 - -// -// local registers -// -#define cnt r16 -#define buf2 r17 -#define saved_lc r18 -#define saved_pfs r19 -#define tmp r20 -#define len2 r21 -#define len3 r22 - -// -// Theory of operations: -// - we check whether or not the buffer is small, i.e., less than 17 -// in which case we do the byte by byte loop. -// -// - Otherwise we go progressively from 1 byte store to 8byte store in -// the head part, the body is a 16byte store loop and we finish we the -// tail for the last 15 bytes. -// The good point about this breakdown is that the long buffer handling -// contains only 2 branches. -// -// The reason for not using shifting & masking for both the head and the -// tail is to stay semantically correct. This routine is not supposed -// to write bytes outside of the buffer. While most of the time this would -// be ok, we can't tolerate a mistake. A classical example is the case -// of multithreaded code were to the extra bytes touched is actually owned -// by another thread which runs concurrently to ours. Another, less likely, -// example is with device drivers where reading an I/O mapped location may -// have side effects (same thing for writing). -// - -GLOBAL_ENTRY(__do_clear_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,2,0,0,0 - cmp.eq p6,p0=r0,len // check for zero length - .save ar.lc, saved_lc - mov saved_lc=ar.lc // preserve ar.lc (slow) - .body - ;; // avoid WAW on CFM - adds tmp=-1,len // br.ctop is repeat/until - mov ret0=len // return value is length at this point -(p6) br.ret.spnt.many rp - ;; - cmp.lt p6,p0=16,len // if len > 16 then long memset - mov ar.lc=tmp // initialize lc for small count -(p6) br.cond.dptk .long_do_clear - ;; // WAR on ar.lc - // - // worst case 16 iterations, avg 8 iterations - // - // We could have played with the predicates to use the extra - // M slot for 2 stores/iteration but the cost the initialization - // the various counters compared to how long the loop is supposed - // to last on average does not make this solution viable. - // -1: - EX( .Lexit1, st1 [buf]=r0,1 ) - adds len=-1,len // countdown length using len - br.cloop.dptk 1b - ;; // avoid RAW on ar.lc - // - // .Lexit4: comes from byte by byte loop - // len contains bytes left -.Lexit1: - mov ret0=len // faster than using ar.lc - mov ar.lc=saved_lc - br.ret.sptk.many rp // end of short clear_user - - - // - // At this point we know we have more than 16 bytes to copy - // so we focus on alignment (no branches required) - // - // The use of len/len2 for countdown of the number of bytes left - // instead of ret0 is due to the fact that the exception code - // changes the values of r8. - // -.long_do_clear: - tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) - ;; - EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned -(p6) adds len=-1,len;; // sync because buf is modified - tbit.nz p6,p0=buf,1 - ;; - EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned -(p6) adds len=-2,len;; - tbit.nz p6,p0=buf,2 - ;; - EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned -(p6) adds len=-4,len;; - tbit.nz p6,p0=buf,3 - ;; - EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned -(p6) adds len=-8,len;; - shr.u cnt=len,4 // number of 128-bit (2x64bit) words - ;; - cmp.eq p6,p0=r0,cnt - adds tmp=-1,cnt -(p6) br.cond.dpnt .dotail // we have less than 16 bytes left - ;; - adds buf2=8,buf // setup second base pointer - mov ar.lc=tmp - ;; - - // - // 16bytes/iteration core loop - // - // The second store can never generate a fault because - // we come into the loop only when we are 16-byte aligned. - // This means that if we cross a page then it will always be - // in the first store and never in the second. - // - // - // We need to keep track of the remaining length. A possible (optimistic) - // way would be to use ar.lc and derive how many byte were left by - // doing : left= 16*ar.lc + 16. this would avoid the addition at - // every iteration. - // However we need to keep the synchronization point. A template - // M;;MB does not exist and thus we can keep the addition at no - // extra cycle cost (use a nop slot anyway). It also simplifies the - // (unlikely) error recovery code - // - -2: EX(.Lexit3, st8 [buf]=r0,16 ) - ;; // needed to get len correct when error - st8 [buf2]=r0,16 - adds len=-16,len - br.cloop.dptk 2b - ;; - mov ar.lc=saved_lc - // - // tail correction based on len only - // - // We alternate the use of len3,len2 to allow parallelism and correct - // error handling. We also reuse p6/p7 to return correct value. - // The addition of len2/len3 does not cost anything more compared to - // the regular memset as we had empty slots. - // -.dotail: - mov len2=len // for parallelization of error handling - mov len3=len - tbit.nz p6,p0=len,3 - ;; - EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes -(p6) adds len3=-8,len2 - tbit.nz p7,p6=len,2 - ;; - EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes -(p7) adds len2=-4,len3 - tbit.nz p6,p7=len,1 - ;; - EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes -(p6) adds len3=-2,len2 - tbit.nz p7,p6=len,0 - ;; - EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left - mov ret0=r0 // success - br.ret.sptk.many rp // end of most likely path - - // - // Outlined error handling code - // - - // - // .Lexit3: comes from core loop, need restore pr/lc - // len contains bytes left - // - // - // .Lexit2: - // if p6 -> coming from st8 or st2 : len2 contains what's left - // if p7 -> coming from st4 or st1 : len3 contains what's left - // We must restore lc/pr even though might not have been used. -.Lexit2: - .pred.rel "mutex", p6, p7 -(p6) mov len=len2 -(p7) mov len=len3 - ;; - // - // .Lexit4: comes from head, need not restore pr/lc - // len contains bytes left - // -.Lexit3: - mov ret0=len - mov ar.lc=saved_lc - br.ret.sptk.many rp -END(__do_clear_user) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_page.S --- a/xen/arch/ia64/linux/lib/copy_page.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,98 +0,0 @@ -/* - * - * Optimized version of the standard copy_page() function - * - * Inputs: - * in0: address of target page - * in1: address of source page - * Output: - * no return value - * - * Copyright (C) 1999, 2001 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * David Mosberger <davidm@xxxxxxxxxx> - * - * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies. - */ -#include <asm/asmmacro.h> -#include <asm/page.h> - -#define PIPE_DEPTH 3 -#define EPI p[PIPE_DEPTH-1] - -#define lcount r16 -#define saved_pr r17 -#define saved_lc r18 -#define saved_pfs r19 -#define src1 r20 -#define src2 r21 -#define tgt1 r22 -#define tgt2 r23 -#define srcf r24 -#define tgtf r25 -#define tgt_last r26 - -#define Nrot ((8*PIPE_DEPTH+7)&~7) - -GLOBAL_ENTRY(copy_page) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot - - .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \ - t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH] - .rotp p[PIPE_DEPTH] - - .save ar.lc, saved_lc - mov saved_lc=ar.lc - mov ar.ec=PIPE_DEPTH - - mov lcount=PAGE_SIZE/64-1 - .save pr, saved_pr - mov saved_pr=pr - mov pr.rot=1<<16 - - .body - - mov src1=in1 - adds src2=8,in1 - mov tgt_last = PAGE_SIZE - ;; - adds tgt2=8,in0 - add srcf=512,in1 - mov ar.lc=lcount - mov tgt1=in0 - add tgtf=512,in0 - add tgt_last = tgt_last, in0 - ;; -1: -(p[0]) ld8 t1[0]=[src1],16 -(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16 -(p[0]) ld8 t2[0]=[src2],16 -(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16 - cmp.ltu p6,p0 = tgtf, tgt_last - ;; -(p[0]) ld8 t3[0]=[src1],16 -(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16 -(p[0]) ld8 t4[0]=[src2],16 -(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16 - ;; -(p[0]) ld8 t5[0]=[src1],16 -(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16 -(p[0]) ld8 t6[0]=[src2],16 -(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16 - ;; -(p[0]) ld8 t7[0]=[src1],16 -(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16 -(p[0]) ld8 t8[0]=[src2],16 -(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16 - -(p6) lfetch [srcf], 64 -(p6) lfetch [tgtf], 64 - br.ctop.sptk.few 1b - ;; - mov pr=saved_pr,0xffffffffffff0000 // restore predicates - mov ar.pfs=saved_pfs - mov ar.lc=saved_lc - br.ret.sptk.many rp -END(copy_page) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/copy_user.S --- a/xen/arch/ia64/linux/lib/copy_user.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,610 +0,0 @@ -/* - * - * Optimized version of the copy_user() routine. - * It is used to copy date across the kernel/user boundary. - * - * The source and destination are always on opposite side of - * the boundary. When reading from user space we must catch - * faults on loads. When writing to user space we must catch - * errors on stores. Note that because of the nature of the copy - * we don't need to worry about overlapping regions. - * - * - * Inputs: - * in0 address of source buffer - * in1 address of destination buffer - * in2 number of bytes to copy - * - * Outputs: - * ret0 0 in case of success. The number of bytes NOT copied in - * case of error. - * - * Copyright (C) 2000-2001 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * - * Fixme: - * - handle the case where we have more than 16 bytes and the alignment - * are different. - * - more benchmarking - * - fix extraneous stop bit introduced by the EX() macro. - */ - -#include <asm/asmmacro.h> - -// -// Tuneable parameters -// -#define COPY_BREAK 16 // we do byte copy below (must be >=16) -#define PIPE_DEPTH 21 // pipe depth - -#define EPI p[PIPE_DEPTH-1] - -// -// arguments -// -#define dst in0 -#define src in1 -#define len in2 - -// -// local registers -// -#define t1 r2 // rshift in bytes -#define t2 r3 // lshift in bytes -#define rshift r14 // right shift in bits -#define lshift r15 // left shift in bits -#define word1 r16 -#define word2 r17 -#define cnt r18 -#define len2 r19 -#define saved_lc r20 -#define saved_pr r21 -#define tmp r22 -#define val r23 -#define src1 r24 -#define dst1 r25 -#define src2 r26 -#define dst2 r27 -#define len1 r28 -#define enddst r29 -#define endsrc r30 -#define saved_pfs r31 - -GLOBAL_ENTRY(__copy_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) - - .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] - .rotp p[PIPE_DEPTH] - - adds len2=-1,len // br.ctop is repeat/until - mov ret0=r0 - - ;; // RAW of cfm when len=0 - cmp.eq p8,p0=r0,len // check for zero length - .save ar.lc, saved_lc - mov saved_lc=ar.lc // preserve ar.lc (slow) -(p8) br.ret.spnt.many rp // empty mempcy() - ;; - add enddst=dst,len // first byte after end of source - add endsrc=src,len // first byte after end of destination - .save pr, saved_pr - mov saved_pr=pr // preserve predicates - - .body - - mov dst1=dst // copy because of rotation - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - - mov src1=src // copy because of rotation - mov ar.lc=len2 // initialize lc for small count - cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy - - xor tmp=src,dst // same alignment test prepare -(p10) br.cond.dptk .long_copy_user - ;; // RAW pr.rot/p16 ? - // - // Now we do the byte by byte loop with software pipeline - // - // p7 is necessarily false by now -1: - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 1b - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs // restore ar.ec - br.ret.sptk.many rp // end of short memcpy - - // - // Not 8-byte aligned - // -.diff_align_copy_user: - // At this point we know we have more than 16 bytes to copy - // and also that src and dest do _not_ have the same alignment. - and src2=0x7,src1 // src offset - and dst2=0x7,dst1 // dst offset - ;; - // The basic idea is that we copy byte-by-byte at the head so - // that we can reach 8-byte alignment for both src1 and dst1. - // Then copy the body using software pipelined 8-byte copy, - // shifting the two back-to-back words right and left, then copy - // the tail by copying byte-by-byte. - // - // Fault handling. If the byte-by-byte at the head fails on the - // load, then restart and finish the pipleline by copying zeros - // to the dst1. Then copy zeros for the rest of dst1. - // If 8-byte software pipeline fails on the load, do the same as - // failure_in3 does. If the byte-by-byte at the tail fails, it is - // handled simply by failure_in_pipe1. - // - // The case p14 represents the source has more bytes in the - // the first word (by the shifted part), whereas the p15 needs to - // copy some bytes from the 2nd word of the source that has the - // tail of the 1st of the destination. - // - - // - // Optimization. If dst1 is 8-byte aligned (quite common), we don't need - // to copy the head to dst1, to start 8-byte copy software pipeline. - // We know src1 is not 8-byte aligned in this case. - // - cmp.eq p14,p15=r0,dst2 -(p15) br.cond.spnt 1f - ;; - sub t1=8,src2 - mov t2=src2 - ;; - shl rshift=t2,3 - sub len1=len,t1 // set len1 - ;; - sub lshift=64,rshift - ;; - br.cond.spnt .word_copy_user - ;; -1: - cmp.leu p14,p15=src2,dst2 - sub t1=dst2,src2 - ;; - .pred.rel "mutex", p14, p15 -(p14) sub word1=8,src2 // (8 - src offset) -(p15) sub t1=r0,t1 // absolute value -(p15) sub word1=8,dst2 // (8 - dst offset) - ;; - // For the case p14, we don't need to copy the shifted part to - // the 1st word of destination. - sub t2=8,t1 -(p14) sub word1=word1,t1 - ;; - sub len1=len,word1 // resulting len -(p15) shl rshift=t1,3 // in bits -(p14) shl rshift=t2,3 - ;; -(p14) sub len1=len1,t1 - adds cnt=-1,word1 - ;; - sub lshift=64,rshift - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - mov ar.lc=cnt - ;; -2: - EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 2b - ;; - clrrrb - ;; -.word_copy_user: - cmp.gtu p9,p0=16,len1 -(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy - ;; - shr.u cnt=len1,3 // number of 64-bit words - ;; - adds cnt=-1,cnt - ;; - .pred.rel "mutex", p14, p15 -(p14) sub src1=src1,t2 -(p15) sub src1=src1,t1 - // - // Now both src1 and dst1 point to an 8-byte aligned address. And - // we have more than 8 bytes to copy. - // - mov ar.lc=cnt - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - ;; -3: - // - // The pipleline consists of 3 stages: - // 1 (p16): Load a word from src1 - // 2 (EPI_1): Shift right pair, saving to tmp - // 3 (EPI): Store tmp to dst1 - // - // To make it simple, use at least 2 (p16) loops to set up val1[n] - // because we need 2 back-to-back val1[] to get tmp. - // Note that this implies EPI_2 must be p18 or greater. - // - -#define EPI_1 p[PIPE_DEPTH-2] -#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift -#define CASE(pred, shift) \ - (pred) br.cond.spnt .copy_user_bit##shift -#define BODY(rshift) \ -.copy_user_bit##rshift: \ -1: \ - EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ - EX(3f,(p16) ld8 val1[1]=[src1],8); \ -(p16) mov val1[0]=r0; \ - br.ctop.dptk 1b; \ - ;; \ - br.cond.sptk.many .diff_align_do_tail; \ -2: \ -(EPI) st8 [dst1]=tmp,8; \ -(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ -3: \ -(p16) mov val1[1]=r0; \ -(p16) mov val1[0]=r0; \ - br.ctop.dptk 2b; \ - ;; \ - br.cond.sptk.many .failure_in2 - - // - // Since the instruction 'shrp' requires a fixed 128-bit value - // specifying the bits to shift, we need to provide 7 cases - // below. - // - SWITCH(p6, 8) - SWITCH(p7, 16) - SWITCH(p8, 24) - SWITCH(p9, 32) - SWITCH(p10, 40) - SWITCH(p11, 48) - SWITCH(p12, 56) - ;; - CASE(p6, 8) - CASE(p7, 16) - CASE(p8, 24) - CASE(p9, 32) - CASE(p10, 40) - CASE(p11, 48) - CASE(p12, 56) - ;; - BODY(8) - BODY(16) - BODY(24) - BODY(32) - BODY(40) - BODY(48) - BODY(56) - ;; -.diff_align_do_tail: - .pred.rel "mutex", p14, p15 -(p14) sub src1=src1,t1 -(p14) adds dst1=-8,dst1 -(p15) sub dst1=dst1,t1 - ;; -4: - // Tail correction. - // - // The problem with this piplelined loop is that the last word is not - // loaded and thus parf of the last word written is not correct. - // To fix that, we simply copy the tail byte by byte. - - sub len1=endsrc,src1,1 - clrrrb - ;; - mov ar.ec=PIPE_DEPTH - mov pr.rot=1<<16 // p16=true all others are false - mov ar.lc=len1 - ;; -5: - EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) - EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) - br.ctop.dptk.few 5b - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // Beginning of long mempcy (i.e. > 16 bytes) - // -.long_copy_user: - tbit.nz p6,p7=src1,0 // odd alignment - and tmp=7,tmp - ;; - cmp.eq p10,p8=r0,tmp - mov len1=len // copy because of rotation -(p8) br.cond.dpnt .diff_align_copy_user - ;; - // At this point we know we have more than 16 bytes to copy - // and also that both src and dest have the same alignment - // which may not be the one we want. So for now we must move - // forward slowly until we reach 16byte alignment: no need to - // worry about reaching the end of buffer. - // - EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned -(p6) adds len1=-1,len1;; - tbit.nz p7,p0=src1,1 - ;; - EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned -(p7) adds len1=-2,len1;; - tbit.nz p8,p0=src1,2 - ;; - // - // Stop bit not required after ld4 because if we fail on ld4 - // we have never executed the ld1, therefore st1 is not executed. - // - EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned - ;; - EX(.failure_out,(p6) st1 [dst1]=val1[0],1) - tbit.nz p9,p0=src1,3 - ;; - // - // Stop bit not required after ld8 because if we fail on ld8 - // we have never executed the ld2, therefore st2 is not executed. - // - EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned - EX(.failure_out,(p7) st2 [dst1]=val1[1],2) -(p8) adds len1=-4,len1 - ;; - EX(.failure_out, (p8) st4 [dst1]=val2[0],4) -(p9) adds len1=-8,len1;; - shr.u cnt=len1,4 // number of 128-bit (2x64bit) words - ;; - EX(.failure_out, (p9) st8 [dst1]=val2[1],8) - tbit.nz p6,p0=len1,3 - cmp.eq p7,p0=r0,cnt - adds tmp=-1,cnt // br.ctop is repeat/until -(p7) br.cond.dpnt .dotail // we have less than 16 bytes left - ;; - adds src2=8,src1 - adds dst2=8,dst1 - mov ar.lc=tmp - ;; - // - // 16bytes/iteration - // -2: - EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) -(p16) ld8 val2[0]=[src2],16 - - EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 - br.ctop.dptk 2b - ;; // RAW on src1 when fall through from loop - // - // Tail correction based on len only - // - // No matter where we come from (loop or test) the src1 pointer - // is 16 byte aligned AND we have less than 16 bytes to copy. - // -.dotail: - EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes - tbit.nz p7,p0=len1,2 - ;; - EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes - tbit.nz p8,p0=len1,1 - ;; - EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes - tbit.nz p9,p0=len1,0 - ;; - EX(.failure_out, (p6) st8 [dst1]=val1[0],8) - ;; - EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left - mov ar.lc=saved_lc - ;; - EX(.failure_out,(p7) st4 [dst1]=val1[1],4) - mov pr=saved_pr,0xffffffffffff0000 - ;; - EX(.failure_out, (p8) st2 [dst1]=val2[0],2) - mov ar.pfs=saved_pfs - ;; - EX(.failure_out, (p9) st1 [dst1]=val2[1]) - br.ret.sptk.many rp - - - // - // Here we handle the case where the byte by byte copy fails - // on the load. - // Several factors make the zeroing of the rest of the buffer kind of - // tricky: - // - the pipeline: loads/stores are not in sync (pipeline) - // - // In the same loop iteration, the dst1 pointer does not directly - // reflect where the faulty load was. - // - // - pipeline effect - // When you get a fault on load, you may have valid data from - // previous loads not yet store in transit. Such data must be - // store normally before moving onto zeroing the rest. - // - // - single/multi dispersal independence. - // - // solution: - // - we don't disrupt the pipeline, i.e. data in transit in - // the software pipeline will be eventually move to memory. - // We simply replace the load with a simple mov and keep the - // pipeline going. We can't really do this inline because - // p16 is always reset to 1 when lc > 0. - // -.failure_in_pipe1: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied -1: -(p16) mov val1[0]=r0 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 - br.ctop.dptk 1b - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // This is the case where the byte by byte copy fails on the load - // when we copy the head. We need to finish the pipeline and copy - // zeros for the rest of the destination. Since this happens - // at the top we still need to fill the body and tail. -.failure_in_pipe2: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied -2: -(p16) mov val1[0]=r0 -(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 - br.ctop.dptk 2b - ;; - sub len=enddst,dst1,1 // precompute len - br.cond.dptk.many .failure_in1bis - ;; - - // - // Here we handle the head & tail part when we check for alignment. - // The following code handles only the load failures. The - // main diffculty comes from the fact that loads/stores are - // scheduled. So when you fail on a load, the stores corresponding - // to previous successful loads must be executed. - // - // However some simplifications are possible given the way - // things work. - // - // 1) HEAD - // Theory of operation: - // - // Page A | Page B - // ---------|----- - // 1|8 x - // 1 2|8 x - // 4|8 x - // 1 4|8 x - // 2 4|8 x - // 1 2 4|8 x - // |1 - // |2 x - // |4 x - // - // page_size >= 4k (2^12). (x means 4, 2, 1) - // Here we suppose Page A exists and Page B does not. - // - // As we move towards eight byte alignment we may encounter faults. - // The numbers on each page show the size of the load (current alignment). - // - // Key point: - // - if you fail on 1, 2, 4 then you have never executed any smaller - // size loads, e.g. failing ld4 means no ld1 nor ld2 executed - // before. - // - // This allows us to simplify the cleanup code, because basically you - // only have to worry about "pending" stores in the case of a failing - // ld8(). Given the way the code is written today, this means only - // worry about st2, st4. There we can use the information encapsulated - // into the predicates. - // - // Other key point: - // - if you fail on the ld8 in the head, it means you went straight - // to it, i.e. 8byte alignment within an unexisting page. - // Again this comes from the fact that if you crossed just for the ld8 then - // you are 8byte aligned but also 16byte align, therefore you would - // either go for the 16byte copy loop OR the ld8 in the tail part. - // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible - // because it would mean you had 15bytes to copy in which case you - // would have defaulted to the byte by byte copy. - // - // - // 2) TAIL - // Here we now we have less than 16 bytes AND we are either 8 or 16 byte - // aligned. - // - // Key point: - // This means that we either: - // - are right on a page boundary - // OR - // - are at more than 16 bytes from a page boundary with - // at most 15 bytes to copy: no chance of crossing. - // - // This allows us to assume that if we fail on a load we haven't possibly - // executed any of the previous (tail) ones, so we don't need to do - // any stores. For instance, if we fail on ld2, this means we had - // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. - // - // This means that we are in a situation similar the a fault in the - // head part. That's nice! - // -.failure_in1: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied - sub len=endsrc,src1,1 - // - // we know that ret0 can never be zero at this point - // because we failed why trying to do a load, i.e. there is still - // some work to do. - // The failure_in1bis and length problem is taken care of at the - // calling side. - // - ;; -.failure_in1bis: // from (.failure_in3) - mov ar.lc=len // Continue with a stupid byte store. - ;; -5: - st1 [dst1]=r0,1 - br.cloop.dptk 5b - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // Here we simply restart the loop but instead - // of doing loads we fill the pipeline with zeroes - // We can't simply store r0 because we may have valid - // data in transit in the pipeline. - // ar.lc and ar.ec are setup correctly at this point - // - // we MUST use src1/endsrc here and not dst1/enddst because - // of the pipeline effect. - // -.failure_in3: - sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied - ;; -2: -(p16) mov val1[0]=r0 -(p16) mov val2[0]=r0 -(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 -(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 - br.ctop.dptk 2b - ;; - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? - sub len=enddst,dst1,1 // precompute len -(p6) br.cond.dptk .failure_in1bis - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - -.failure_in2: - sub ret0=endsrc,src1 - cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? - sub len=enddst,dst1,1 // precompute len -(p6) br.cond.dptk .failure_in1bis - ;; - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - // - // handling of failures on stores: that's the easy part - // -.failure_out: - sub ret0=enddst,dst1 - mov pr=saved_pr,0xffffffffffff0000 - mov ar.lc=saved_lc - - mov ar.pfs=saved_pfs - br.ret.sptk.many rp -END(__copy_user) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/csum_partial_copy.c --- a/xen/arch/ia64/linux/lib/csum_partial_copy.c Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,151 +0,0 @@ -/* - * Network Checksum & Copy routine - * - * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * - * Most of the code has been imported from Linux/Alpha - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/string.h> - -#include <asm/uaccess.h> - -/* - * XXX Fixme: those 2 inlines are meant for debugging and will go away - */ -static inline unsigned -short from64to16(unsigned long x) -{ - /* add up 32-bit words for 33 bits */ - x = (x & 0xffffffff) + (x >> 32); - /* add up 16-bit and 17-bit words for 17+c bits */ - x = (x & 0xffff) + (x >> 16); - /* add up 16-bit and 2-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -static inline -unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum) -{ - int odd, count; - unsigned long result = (unsigned long)psum; - - if (len <= 0) - goto out; - odd = 1 & (unsigned long) buff; - if (odd) { - result = *buff << 8; - len--; - buff++; - } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *) buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ - if (count) { - unsigned long carry = 0; - do { - unsigned long w = *(unsigned long *) buff; - count--; - buff += 8; - result += carry; - result += w; - carry = (w > result); - } while (count); - result += carry; - result = (result & 0xffffffff) + (result >> 32); - } - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } - } - if (len & 2) { - result += *(unsigned short *) buff; - buff += 2; - } - } - if (len & 1) - result += *buff; - - result = from64to16(result); - - if (odd) - result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); - -out: - return result; -} - -/* - * XXX Fixme - * - * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS. - * But it's very tricky to get right even in C. - */ -extern unsigned long do_csum(const unsigned char *, long); - -static unsigned int -do_csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, - int len, unsigned int psum, int *errp) -{ - unsigned long result; - - /* XXX Fixme - * for now we separate the copy from checksum for obvious - * alignment difficulties. Look at the Alpha code and you'll be - * scared. - */ - - if (__copy_from_user(dst, src, len) != 0 && errp) - *errp = -EFAULT; - - result = do_csum(dst, len); - - /* add in old sum, and carry.. */ - result += psum; - /* 32+c bits -> 32 bits */ - result = (result & 0xffffffff) + (result >> 32); - return result; -} - -unsigned int -csum_partial_copy_from_user (const unsigned char __user *src, unsigned char *dst, - int len, unsigned int sum, int *errp) -{ - if (!access_ok(VERIFY_READ, src, len)) { - *errp = -EFAULT; - memset(dst, 0, len); - return sum; - } - - return do_csum_partial_copy_from_user(src, dst, len, sum, errp); -} - -unsigned int -csum_partial_copy_nocheck(const unsigned char __user *src, unsigned char *dst, - int len, unsigned int sum) -{ - return do_csum_partial_copy_from_user(src, dst, len, sum, NULL); -} - -EXPORT_SYMBOL(csum_partial_copy_nocheck); diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/dec_and_lock.c --- a/xen/arch/ia64/linux/lib/dec_and_lock.c Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2003 Jerome Marchand, Bull S.A. - * Cleaned up by David Mosberger-Tang <davidm@xxxxxxxxxx> - * - * This file is released under the GPLv2, or at your option any later version. - * - * ia64 version of "atomic_dec_and_lock()" using the atomic "cmpxchg" instruction. This - * code is an adaptation of the x86 version of "atomic_dec_and_lock()". - */ - -#include <linux/compiler.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <asm/atomic.h> - -/* - * Decrement REFCOUNT and if the count reaches zero, acquire the spinlock. Both of these - * operations have to be done atomically, so that the count doesn't drop to zero without - * acquiring the spinlock first. - */ -int -_atomic_dec_and_lock (atomic_t *refcount, spinlock_t *lock) -{ - int old, new; - - do { - old = atomic_read(refcount); - new = old - 1; - - if (unlikely (old == 1)) { - /* oops, we may be decrementing to zero, do it the slow way... */ - spin_lock(lock); - if (atomic_dec_and_test(refcount)) - return 1; - spin_unlock(lock); - return 0; - } - } while (cmpxchg(&refcount->counter, old, new) != old); - return 0; -} - -EXPORT_SYMBOL(_atomic_dec_and_lock); diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/do_csum.S --- a/xen/arch/ia64/linux/lib/do_csum.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,323 +0,0 @@ -/* - * - * Optmized version of the standard do_csum() function - * - * Return: a 64bit quantity containing the 16bit Internet checksum - * - * Inputs: - * in0: address of buffer to checksum (char *) - * in1: length of the buffer (int) - * - * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * - * 02/04/22 Ken Chen <kenneth.w.chen@xxxxxxxxx> - * Data locality study on the checksum buffer. - * More optimization cleanup - remove excessive stop bits. - * 02/04/08 David Mosberger <davidm@xxxxxxxxxx> - * More cleanup and tuning. - * 01/04/18 Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Clean up and optimize and the software pipeline, loading two - * back-to-back 8-byte words per loop. Clean up the initialization - * for the loop. Support the cases where load latency = 1 or 2. - * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default). - */ - -#include <asm/asmmacro.h> - -// -// Theory of operations: -// The goal is to go as quickly as possible to the point where -// we can checksum 16 bytes/loop. Before reaching that point we must -// take care of incorrect alignment of first byte. -// -// The code hereafter also takes care of the "tail" part of the buffer -// before entering the core loop, if any. The checksum is a sum so it -// allows us to commute operations. So we do the "head" and "tail" -// first to finish at full speed in the body. Once we get the head and -// tail values, we feed them into the pipeline, very handy initialization. -// -// Of course we deal with the special case where the whole buffer fits -// into one 8 byte word. In this case we have only one entry in the pipeline. -// -// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for -// possible load latency and also to accommodate for head and tail. -// -// The end of the function deals with folding the checksum from 64bits -// down to 16bits taking care of the carry. -// -// This version avoids synchronization in the core loop by also using a -// pipeline for the accumulation of the checksum in resultx[] (x=1,2). -// -// wordx[] (x=1,2) -// |---| -// | | 0 : new value loaded in pipeline -// |---| -// | | - : in transit data -// |---| -// | | LOAD_LATENCY : current value to add to checksum -// |---| -// | | LOAD_LATENCY+1 : previous value added to checksum -// |---| (previous iteration) -// -// resultx[] (x=1,2) -// |---| -// | | 0 : initial value -// |---| -// | | LOAD_LATENCY-1 : new checksum -// |---| -// | | LOAD_LATENCY : previous value of checksum -// |---| -// | | LOAD_LATENCY+1 : final checksum when out of the loop -// |---| -// -// -// See RFC1071 "Computing the Internet Checksum" for various techniques for -// calculating the Internet checksum. -// -// NOT YET DONE: -// - Maybe another algorithm which would take care of the folding at the -// end in a different manner -// - Work with people more knowledgeable than me on the network stack -// to figure out if we could not split the function depending on the -// type of packet or alignment we get. Like the ip_fast_csum() routine -// where we know we have at least 20bytes worth of data to checksum. -// - Do a better job of handling small packets. -// - Note on prefetching: it was found that under various load, i.e. ftp read/write, -// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8% -// on the data that buffer points to (partly because the checksum is often preceded by -// a copy_from_user()). This finding indiate that lfetch will not be beneficial since -// the data is already in the cache. -// - -#define saved_pfs r11 -#define hmask r16 -#define tmask r17 -#define first1 r18 -#define firstval r19 -#define firstoff r20 -#define last r21 -#define lastval r22 -#define lastoff r23 -#define saved_lc r24 -#define saved_pr r25 -#define tmp1 r26 -#define tmp2 r27 -#define tmp3 r28 -#define carry1 r29 -#define carry2 r30 -#define first2 r31 - -#define buf in0 -#define len in1 - -#define LOAD_LATENCY 2 // XXX fix me - -#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2) -# error "Only 1 or 2 is supported/tested for LOAD_LATENCY." -#endif - -#define PIPE_DEPTH (LOAD_LATENCY+2) -#define ELD p[LOAD_LATENCY] // end of load -#define ELD_1 p[LOAD_LATENCY+1] // and next stage - -// unsigned long do_csum(unsigned char *buf,long len) - -GLOBAL_ENTRY(do_csum) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,2,16,0,16 - .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2] - .rotp p[PIPE_DEPTH], pC1[2], pC2[2] - mov ret0=r0 // in case we have zero length - cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len) - ;; - add tmp1=buf,len // last byte's address - .save pr, saved_pr - mov saved_pr=pr // preserve predicates (rotation) -(p6) br.ret.spnt.many rp // return if zero or negative length - - mov hmask=-1 // initialize head mask - tbit.nz p15,p0=buf,0 // is buf an odd address? - and first1=-8,buf // 8-byte align down address of first1 element - - and firstoff=7,buf // how many bytes off for first1 element - mov tmask=-1 // initialize tail mask - - ;; - adds tmp2=-1,tmp1 // last-1 - and lastoff=7,tmp1 // how many bytes off for last element - ;; - sub tmp1=8,lastoff // complement to lastoff - and last=-8,tmp2 // address of word containing last byte - ;; - sub tmp3=last,first1 // tmp3=distance from first1 to last - .save ar.lc, saved_lc - mov saved_lc=ar.lc // save lc - cmp.eq p8,p9=last,first1 // everything fits in one word ? - - ld8 firstval=[first1],8 // load, ahead of time, "first1" word - and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0 - shl tmp2=firstoff,3 // number of bits - ;; -(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed - shl tmp1=tmp1,3 // number of bits -(p9) adds tmp3=-8,tmp3 // effectively loaded - ;; -(p8) mov lastval=r0 // we don't need lastval if first1==last - shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[ - shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff] - ;; - .body -#define count tmp3 - -(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only -(p9) and word2[0]=lastval,tmask // mask last it as appropriate - shr.u count=count,3 // how many 8-byte? - ;; - // If count is odd, finish this 8-byte word so that we can - // load two back-to-back 8-byte words per loop thereafter. - and word1[0]=firstval,hmask // and mask it as appropriate - tbit.nz p10,p11=count,0 // if (count is odd) - ;; -(p8) mov result1[0]=word1[0] -(p9) add result1[0]=word1[0],word2[0] - ;; - cmp.ltu p6,p0=result1[0],word1[0] // check the carry - cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte - ;; -(p6) adds result1[0]=1,result1[0] -(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word) -(p11) br.cond.dptk .do_csum16 // if (count is even) - - // Here count is odd. - ld8 word1[1]=[first1],8 // load an 8-byte word - cmp.eq p9,p10=1,count // if (count == 1) - adds count=-1,count // loaded an 8-byte word - ;; - add result1[0]=result1[0],word1[1] - ;; - cmp.ltu p6,p0=result1[0],word1[1] - ;; -(p6) adds result1[0]=1,result1[0] -(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit - // Fall through to caluculate the checksum, feeding result1[0] as - // the initial value in result1[0]. - // - // Calculate the checksum loading two 8-byte words per loop. - // -.do_csum16: - add first2=8,first1 - shr.u count=count,1 // we do 16 bytes per loop - ;; - adds count=-1,count - mov carry1=r0 - mov carry2=r0 - brp.loop.imp 1f,2f - ;; - mov ar.ec=PIPE_DEPTH - mov ar.lc=count // set lc - mov pr.rot=1<<16 - // result1[0] must be initialized in advance. - mov result2[0]=r0 - ;; - .align 32 -1: -(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1] -(pC1[1])adds carry1=1,carry1 -(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1] -(pC2[1])adds carry2=1,carry2 -(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY] -(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY] -2: -(p[0]) ld8 word1[0]=[first1],16 -(p[0]) ld8 word2[0]=[first2],16 - br.ctop.sptk 1b - ;; - // Since len is a 32-bit value, carry cannot be larger than a 64-bit value. -(pC1[1])adds carry1=1,carry1 // since we miss the last one -(pC2[1])adds carry2=1,carry2 - ;; - add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1 - add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2 - ;; - cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1 - cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2 - ;; -(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1] -(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1] - ;; - add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1] - ;; - cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1] - ;; -(p6) adds result1[0]=1,result1[0] - ;; -.do_csum_exit: - // - // now fold 64 into 16 bits taking care of carry - // that's not very good because it has lots of sequentiality - // - mov tmp3=0xffff - zxt4 tmp1=result1[0] - shr.u tmp2=result1[0],32 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add result1[0]=tmp1,tmp2 - ;; - and tmp1=result1[0],tmp3 - shr.u tmp2=result1[0],16 - ;; - add ret0=tmp1,tmp2 - mov pr=saved_pr,0xffffffffffff0000 - ;; - // if buf was odd then swap bytes - mov ar.pfs=saved_pfs // restore ar.ec -(p15) mux1 ret0=ret0,@rev // reverse word - ;; - mov ar.lc=saved_lc -(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes - br.ret.sptk.many rp - -// I (Jun Nakajima) wrote an equivalent code (see below), but it was -// not much better than the original. So keep the original there so that -// someone else can challenge. -// -// shr.u word1[0]=result1[0],32 -// zxt4 result1[0]=result1[0] -// ;; -// add result1[0]=result1[0],word1[0] -// ;; -// zxt2 result2[0]=result1[0] -// extr.u word1[0]=result1[0],16,16 -// shr.u carry1=result1[0],32 -// ;; -// add result2[0]=result2[0],word1[0] -// ;; -// add result2[0]=result2[0],carry1 -// ;; -// extr.u ret0=result2[0],16,16 -// ;; -// add ret0=ret0,result2[0] -// ;; -// zxt2 ret0=ret0 -// mov ar.pfs=saved_pfs // restore ar.ec -// mov pr=saved_pr,0xffffffffffff0000 -// ;; -// // if buf was odd then swap bytes -// mov ar.lc=saved_lc -//(p15) mux1 ret0=ret0,@rev // reverse word -// ;; -//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes -// br.ret.sptk.many rp - -END(do_csum) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/io.c --- a/xen/arch/ia64/linux/lib/io.c Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,165 +0,0 @@ -#include <linux/config.h> -#include <linux/module.h> -#include <linux/types.h> - -#include <asm/io.h> - -/* - * Copy data from IO memory space to "real" memory space. - * This needs to be optimized. - */ -void memcpy_fromio(void *to, const volatile void __iomem *from, long count) -{ - char *dst = to; - - while (count) { - count--; - *dst++ = readb(from++); - } -} -EXPORT_SYMBOL(memcpy_fromio); - -/* - * Copy data from "real" memory space to IO memory space. - * This needs to be optimized. - */ -void memcpy_toio(volatile void __iomem *to, const void *from, long count) -{ - const char *src = from; - - while (count) { - count--; - writeb(*src++, to++); - } -} -EXPORT_SYMBOL(memcpy_toio); - -/* - * "memset" on IO memory space. - * This needs to be optimized. - */ -void memset_io(volatile void __iomem *dst, int c, long count) -{ - unsigned char ch = (char)(c & 0xff); - - while (count) { - count--; - writeb(ch, dst); - dst++; - } -} -EXPORT_SYMBOL(memset_io); - -#ifdef CONFIG_IA64_GENERIC - -#undef __ia64_inb -#undef __ia64_inw -#undef __ia64_inl -#undef __ia64_outb -#undef __ia64_outw -#undef __ia64_outl -#undef __ia64_readb -#undef __ia64_readw -#undef __ia64_readl -#undef __ia64_readq -#undef __ia64_readb_relaxed -#undef __ia64_readw_relaxed -#undef __ia64_readl_relaxed -#undef __ia64_readq_relaxed -#undef __ia64_writeb -#undef __ia64_writew -#undef __ia64_writel -#undef __ia64_writeq -#undef __ia64_mmiowb - -unsigned int -__ia64_inb (unsigned long port) -{ - return ___ia64_inb(port); -} - -unsigned int -__ia64_inw (unsigned long port) -{ - return ___ia64_inw(port); -} - -unsigned int -__ia64_inl (unsigned long port) -{ - return ___ia64_inl(port); -} - -void -__ia64_outb (unsigned char val, unsigned long port) -{ - ___ia64_outb(val, port); -} - -void -__ia64_outw (unsigned short val, unsigned long port) -{ - ___ia64_outw(val, port); -} - -void -__ia64_outl (unsigned int val, unsigned long port) -{ - ___ia64_outl(val, port); -} - -unsigned char -__ia64_readb (void __iomem *addr) -{ - return ___ia64_readb (addr); -} - -unsigned short -__ia64_readw (void __iomem *addr) -{ - return ___ia64_readw (addr); -} - -unsigned int -__ia64_readl (void __iomem *addr) -{ - return ___ia64_readl (addr); -} - -unsigned long -__ia64_readq (void __iomem *addr) -{ - return ___ia64_readq (addr); -} - -unsigned char -__ia64_readb_relaxed (void __iomem *addr) -{ - return ___ia64_readb (addr); -} - -unsigned short -__ia64_readw_relaxed (void __iomem *addr) -{ - return ___ia64_readw (addr); -} - -unsigned int -__ia64_readl_relaxed (void __iomem *addr) -{ - return ___ia64_readl (addr); -} - -unsigned long -__ia64_readq_relaxed (void __iomem *addr) -{ - return ___ia64_readq (addr); -} - -void -__ia64_mmiowb(void) -{ - ___ia64_mmiowb(); -} - -#endif /* CONFIG_IA64_GENERIC */ diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/ip_fast_csum.S --- a/xen/arch/ia64/linux/lib/ip_fast_csum.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,90 +0,0 @@ -/* - * Optmized version of the ip_fast_csum() function - * Used for calculating IP header checksum - * - * Return: 16bit checksum, complemented - * - * Inputs: - * in0: address of buffer to checksum (char *) - * in1: length of the buffer (int) - * - * Copyright (C) 2002 Intel Corp. - * Copyright (C) 2002 Ken Chen <kenneth.w.chen@xxxxxxxxx> - */ - -#include <asm/asmmacro.h> - -/* - * Since we know that most likely this function is called with buf aligned - * on 4-byte boundary and 20 bytes in length, we can execution rather quickly - * versus calling generic version of do_csum, which has lots of overhead in - * handling various alignments and sizes. However, due to lack of constrains - * put on the function input argument, cases with alignment not on 4-byte or - * size not equal to 20 bytes will be handled by the generic do_csum function. - */ - -#define in0 r32 -#define in1 r33 -#define ret0 r8 - -GLOBAL_ENTRY(ip_fast_csum) - .prologue - .body - cmp.ne p6,p7=5,in1 // size other than 20 byte? - and r14=3,in0 // is it aligned on 4-byte? - add r15=4,in0 // second source pointer - ;; - cmp.ne.or.andcm p6,p7=r14,r0 - ;; -(p7) ld4 r20=[in0],8 -(p7) ld4 r21=[r15],8 -(p6) br.spnt .generic - ;; - ld4 r22=[in0],8 - ld4 r23=[r15],8 - ;; - ld4 r24=[in0] - add r20=r20,r21 - add r22=r22,r23 - ;; - add r20=r20,r22 - ;; - add r20=r20,r24 - ;; - shr.u ret0=r20,16 // now need to add the carry - zxt2 r20=r20 - ;; - add r20=ret0,r20 - ;; - shr.u ret0=r20,16 // add carry again - zxt2 r20=r20 - ;; - add r20=ret0,r20 - ;; - shr.u ret0=r20,16 - zxt2 r20=r20 - ;; - add r20=ret0,r20 - ;; - andcm ret0=-1,r20 - .restore sp // reset frame state - br.ret.sptk.many b0 - ;; - -.generic: - .prologue - .save ar.pfs, r35 - alloc r35=ar.pfs,2,2,2,0 - .save rp, r34 - mov r34=b0 - .body - dep.z out1=in1,2,30 - mov out0=in0 - ;; - br.call.sptk.many b0=do_csum - ;; - andcm ret0=-1,ret0 - mov ar.pfs=r35 - mov b0=r34 - br.ret.sptk.many b0 -END(ip_fast_csum) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/memcpy.S --- a/xen/arch/ia64/linux/lib/memcpy.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,301 +0,0 @@ -/* - * - * Optimized version of the standard memcpy() function - * - * Inputs: - * in0: destination address - * in1: source address - * in2: number of bytes to copy - * Output: - * no return value - * - * Copyright (C) 2000-2001 Hewlett-Packard Co - * Stephane Eranian <eranian@xxxxxxxxxx> - * David Mosberger-Tang <davidm@xxxxxxxxxx> - */ -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(memcpy) - -# define MEM_LAT 21 /* latency to memory */ - -# define dst r2 -# define src r3 -# define retval r8 -# define saved_pfs r9 -# define saved_lc r10 -# define saved_pr r11 -# define cnt r16 -# define src2 r17 -# define t0 r18 -# define t1 r19 -# define t2 r20 -# define t3 r21 -# define t4 r22 -# define src_end r23 - -# define N (MEM_LAT + 4) -# define Nrot ((N + 7) & ~7) - - /* - * First, check if everything (src, dst, len) is a multiple of eight. If - * so, we handle everything with no taken branches (other than the loop - * itself) and a small icache footprint. Otherwise, we jump off to - * the more general copy routine handling arbitrary - * sizes/alignment etc. - */ - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot - .save ar.lc, saved_lc - mov saved_lc=ar.lc - or t0=in0,in1 - ;; - - or t0=t0,in2 - .save pr, saved_pr - mov saved_pr=pr - - .body - - cmp.eq p6,p0=in2,r0 // zero length? - mov retval=in0 // return dst -(p6) br.ret.spnt.many rp // zero length, return immediately - ;; - - mov dst=in0 // copy because of rotation - shr.u cnt=in2,3 // number of 8-byte words to copy - mov pr.rot=1<<16 - ;; - - adds cnt=-1,cnt // br.ctop is repeat/until - cmp.gtu p7,p0=16,in2 // copying less than 16 bytes? - mov ar.ec=N - ;; - - and t0=0x7,t0 - mov ar.lc=cnt - ;; - cmp.ne p6,p0=t0,r0 - - mov src=in1 // copy because of rotation -(p7) br.cond.spnt.few .memcpy_short -(p6) br.cond.spnt.few .memcpy_long - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; - .rotr val[N] - .rotp p[N] - .align 32 -1: { .mib -(p[0]) ld8 val[0]=[src],8 - nop.i 0 - brp.loop.imp 1b, 2f -} -2: { .mfb -(p[N-1])st8 [dst]=val[N-1],8 - nop.f 0 - br.ctop.dptk.few 1b -} - ;; - mov ar.lc=saved_lc - mov pr=saved_pr,-1 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - /* - * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time - * copy loop. This performs relatively poorly on Itanium, but it doesn't - * get used very often (gcc inlines small copies) and due to atomicity - * issues, we want to avoid read-modify-write of entire words. - */ - .align 32 -.memcpy_short: - adds cnt=-1,in2 // br.ctop is repeat/until - mov ar.ec=MEM_LAT - brp.loop.imp 1f, 2f - ;; - mov ar.lc=cnt - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; - nop.m 0 - ;; - /* - * It is faster to put a stop bit in the loop here because it makes - * the pipeline shorter (and latency is what matters on short copies). - */ - .align 32 -1: { .mib -(p[0]) ld1 val[0]=[src],1 - nop.i 0 - brp.loop.imp 1b, 2f -} ;; -2: { .mfb -(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1 - nop.f 0 - br.ctop.dptk.few 1b -} ;; - mov ar.lc=saved_lc - mov pr=saved_pr,-1 - mov ar.pfs=saved_pfs - br.ret.sptk.many rp - - /* - * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't - * an overriding concern here, but throughput is. We first do - * sub-word copying until the destination is aligned, then we check - * if the source is also aligned. If so, we do a simple load/store-loop - * until there are less than 8 bytes left over and then we do the tail, - * by storing the last few bytes using sub-word copying. If the source - * is not aligned, we branch off to the non-congruent loop. - * - * stage: op: - * 0 ld - * : - * MEM_LAT+3 shrp - * MEM_LAT+4 st - * - * On Itanium, the pipeline itself runs without stalls. However, br.ctop - * seems to introduce an unavoidable bubble in the pipeline so the overall - * latency is 2 cycles/iteration. This gives us a _copy_ throughput - * of 4 byte/cycle. Still not bad. - */ -# undef N -# undef Nrot -# define N (MEM_LAT + 5) /* number of stages */ -# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */ - -#define LOG_LOOP_SIZE 6 - -.memcpy_long: - alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame - and t0=-8,src // t0 = src & ~7 - and t2=7,src // t2 = src & 7 - ;; - ld8 t0=[t0] // t0 = 1st source word - adds src2=7,src // src2 = (src + 7) - sub t4=r0,dst // t4 = -dst - ;; - and src2=-8,src2 // src2 = (src + 7) & ~7 - shl t2=t2,3 // t2 = 8*(src & 7) - shl t4=t4,3 // t4 = 8*(dst & 7) - ;; - ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise - sub t3=64,t2 // t3 = 64-8*(src & 7) - shr.u t0=t0,t2 - ;; - add src_end=src,in2 - shl t1=t1,t3 - mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7) - ;; - or t0=t0,t1 - mov cnt=r0 - adds src_end=-1,src_end - ;; -(p3) st1 [dst]=t0,1 -(p3) shr.u t0=t0,8 -(p3) adds cnt=1,cnt - ;; -(p4) st2 [dst]=t0,2 -(p4) shr.u t0=t0,16 -(p4) adds cnt=2,cnt - ;; -(p5) st4 [dst]=t0,4 -(p5) adds cnt=4,cnt - and src_end=-8,src_end // src_end = last word of source buffer - ;; - - // At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy: - -1:{ add src=cnt,src // make src point to remainder of source buffer - sub cnt=in2,cnt // cnt = number of bytes left to copy - mov t4=ip - } ;; - and src2=-8,src // align source pointer - adds t4=.memcpy_loops-1b,t4 - mov ar.ec=N - - and t0=7,src // t0 = src & 7 - shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy - shl cnt=cnt,3 // move bits 0-2 to 3-5 - ;; - - .rotr val[N+1], w[2] - .rotp p[N] - - cmp.ne p6,p0=t0,r0 // is src aligned, too? - shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7) - adds t2=-1,t2 // br.ctop is repeat/until - ;; - add t4=t0,t4 - mov pr=cnt,0x38 // set (p5,p4,p3) to # of bytes last-word bytes to copy - mov ar.lc=t2 - ;; - nop.m 0 - ;; - nop.m 0 - nop.i 0 - ;; - nop.m 0 - ;; -(p6) ld8 val[1]=[src2],8 // prime the pump... - mov b6=t4 - br.sptk.few b6 - ;; - -.memcpy_tail: - // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is - // less than 8) and t0 contains the last few bytes of the src buffer: -(p5) st4 [dst]=t0,4 -(p5) shr.u t0=t0,32 - mov ar.lc=saved_lc - ;; -(p4) st2 [dst]=t0,2 -(p4) shr.u t0=t0,16 - mov ar.pfs=saved_pfs - ;; -(p3) st1 [dst]=t0 - mov pr=saved_pr,-1 - br.ret.sptk.many rp - -/////////////////////////////////////////////////////// - .align 64 - -#define COPY(shift,index) \ - 1: { .mib \ - (p[0]) ld8 val[0]=[src2],8; \ - (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \ - brp.loop.imp 1b, 2f \ - }; \ - 2: { .mfb \ - (p[MEM_LAT+4]) st8 [dst]=w[1],8; \ - nop.f 0; \ - br.ctop.dptk.few 1b; \ - }; \ - ;; \ - ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \ - ;; \ - shrp t0=val[N-1],val[N-index],shift; \ - br .memcpy_tail -.memcpy_loops: - COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */ - COPY(8, 0) - COPY(16, 0) - COPY(24, 0) - COPY(32, 0) - COPY(40, 0) - COPY(48, 0) - COPY(56, 0) - -END(memcpy) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strlen_user.S --- a/xen/arch/ia64/linux/lib/strlen_user.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,198 +0,0 @@ -/* - * Optimized version of the strlen_user() function - * - * Inputs: - * in0 address of buffer - * - * Outputs: - * ret0 0 in case of fault, strlen(buffer)+1 otherwise - * - * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co - * David Mosberger-Tang <davidm@xxxxxxxxxx> - * Stephane Eranian <eranian@xxxxxxxxxx> - * - * 01/19/99 S.Eranian heavily enhanced version (see details below) - * 09/24/99 S.Eranian added speculation recovery code - */ - -#include <asm/asmmacro.h> - -// -// int strlen_user(char *) -// ------------------------ -// Returns: -// - length of string + 1 -// - 0 in case an exception is raised -// -// This is an enhanced version of the basic strlen_user. it includes a -// combination of compute zero index (czx), parallel comparisons, speculative -// loads and loop unroll using rotating registers. -// -// General Ideas about the algorithm: -// The goal is to look at the string in chunks of 8 bytes. -// so we need to do a few extra checks at the beginning because the -// string may not be 8-byte aligned. In this case we load the 8byte -// quantity which includes the start of the string and mask the unused -// bytes with 0xff to avoid confusing czx. -// We use speculative loads and software pipelining to hide memory -// latency and do read ahead safely. This way we defer any exception. -// -// Because we don't want the kernel to be relying on particular -// settings of the DCR register, we provide recovery code in case -// speculation fails. The recovery code is going to "redo" the work using -// only normal loads. If we still get a fault then we return an -// error (ret0=0). Otherwise we return the strlen+1 as usual. -// The fact that speculation may fail can be caused, for instance, by -// the DCR.dm bit being set. In this case TLB misses are deferred, i.e., -// a NaT bit will be set if the translation is not present. The normal -// load, on the other hand, will cause the translation to be inserted -// if the mapping exists. -// -// It should be noted that we execute recovery code only when we need -// to use the data that has been speculatively loaded: we don't execute -// recovery code on pure read ahead data. -// -// Remarks: -// - the cmp r0,r0 is used as a fast way to initialize a predicate -// register to 1. This is required to make sure that we get the parallel -// compare correct. -// -// - we don't use the epilogue counter to exit the loop but we need to set -// it to zero beforehand. -// -// - after the loop we must test for Nat values because neither the -// czx nor cmp instruction raise a NaT consumption fault. We must be -// careful not to look too far for a Nat for which we don't care. -// For instance we don't need to look at a NaT in val2 if the zero byte -// was in val1. -// -// - Clearly performance tuning is required. -// - -#define saved_pfs r11 -#define tmp r10 -#define base r16 -#define orig r17 -#define saved_pr r18 -#define src r19 -#define mask r20 -#define val r21 -#define val1 r22 -#define val2 r23 - -GLOBAL_ENTRY(__strlen_user) - .prologue - .save ar.pfs, saved_pfs - alloc saved_pfs=ar.pfs,11,0,0,8 - - .rotr v[2], w[2] // declares our 4 aliases - - extr.u tmp=in0,0,3 // tmp=least significant 3 bits - mov orig=in0 // keep trackof initial byte address - dep src=0,in0,0,3 // src=8byte-aligned in0 address - .save pr, saved_pr - mov saved_pr=pr // preserve predicates (rotation) - ;; - - .body - - ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate) - shl tmp=tmp,3 // multiply by 8bits/byte - mov mask=-1 // our mask - ;; - ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline - cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and) - sub tmp=64,tmp // how many bits to shift our mask on the right - ;; - shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part - mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs) - ;; - add base=-16,src // keep track of aligned base - chk.s v[1], .recover // if already NaT, then directly skip to recover - or v[1]=v[1],mask // now we have a safe initial byte pattern - ;; -1: - ld8.s v[0]=[src],8 // speculatively load next - czx1.r val1=v[1] // search 0 byte from right - czx1.r val2=w[1] // search 0 byte from right following 8bytes - ;; - ld8.s w[0]=[src],8 // speculatively load next to next - cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8 - cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8 -(p6) br.wtop.dptk.few 1b // loop until p6 == 0 - ;; - // - // We must return try the recovery code iff - // val1_is_nat || (val1==8 && val2_is_nat) - // - // XXX Fixme - // - there must be a better way of doing the test - // - cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate) - tnat.nz p6,p7=val1 // test NaT on val1 -(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT - ;; - // - // if we come here p7 is true, i.e., initialized for // cmp - // - cmp.eq.and p7,p0=8,val1// val1==8? - tnat.nz.and p7,p0=val2 // test NaT if val2 -(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT - ;; -(p8) mov val1=val2 // val2 contains the value -(p8) adds src=-16,src // correct position when 3 ahead -(p9) adds src=-24,src // correct position when 4 ahead - ;; - sub ret0=src,orig // distance from origin - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // length=now - back -1 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of normal execution - - // - // Outlined recovery code when speculation failed - // - // This time we don't use speculation and rely on the normal exception - // mechanism. that's why the loop is not as good as the previous one - // because read ahead is not possible - // - // XXX Fixme - // - today we restart from the beginning of the string instead - // of trying to continue where we left off. - // -.recover: - EX(.Lexit1, ld8 val=[base],8) // load the initial bytes - ;; - or val=val,mask // remask first bytes - cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop - ;; - // - // ar.ec is still zero here - // -2: - EX(.Lexit1, (p6) ld8 val=[base],8) - ;; - czx1.r val1=val // search 0 byte from right - ;; - cmp.eq p6,p0=8,val1 // val1==8 ? -(p6) br.wtop.dptk.few 2b // loop until p6 == 0 - ;; - sub ret0=base,orig // distance from base - sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1 - mov pr=saved_pr,0xffffffffffff0000 - ;; - sub ret0=ret0,tmp // length=now - back -1 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp // end of successful recovery code - - // - // We failed even on the normal load (called from exception handler) - // -.Lexit1: - mov ret0=0 - mov pr=saved_pr,0xffffffffffff0000 - mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what - br.ret.sptk.many rp -END(__strlen_user) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strncpy_from_user.S --- a/xen/arch/ia64/linux/lib/strncpy_from_user.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,44 +0,0 @@ -/* - * Just like strncpy() except that if a fault occurs during copying, - * -EFAULT is returned. - * - * Inputs: - * in0: address of destination buffer - * in1: address of string to be copied - * in2: length of buffer in bytes - * Outputs: - * r8: -EFAULT in case of fault or number of bytes copied if no fault - * - * Copyright (C) 1998-2001 Hewlett-Packard Co - * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@xxxxxxxxxx> - * - * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by - * by Andreas Schwab <schwab@xxxxxxx>). - */ - -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(__strncpy_from_user) - alloc r2=ar.pfs,3,0,0,0 - mov r8=0 - mov r9=in1 - ;; - add r10=in1,in2 - cmp.eq p6,p0=r0,in2 -(p6) br.ret.spnt.many rp - - // XXX braindead copy loop---this needs to be optimized -.Loop1: - EX(.Lexit, ld1 r8=[in1],1) - ;; - EX(.Lexit, st1 [in0]=r8,1) - cmp.ne p6,p7=r8,r0 - ;; -(p6) cmp.ne.unc p8,p0=in1,r10 -(p8) br.cond.dpnt.few .Loop1 - ;; -(p6) mov r8=in2 // buffer filled up---return buffer length -(p7) sub r8=in1,r9,1 // return string length (excluding NUL character) -[.Lexit:] - br.ret.sptk.many rp -END(__strncpy_from_user) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/strnlen_user.S --- a/xen/arch/ia64/linux/lib/strnlen_user.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,45 +0,0 @@ -/* - * Returns 0 if exception before NUL or reaching the supplied limit (N), - * a value greater than N if the string is longer than the limit, else - * strlen. - * - * Inputs: - * in0: address of buffer - * in1: string length limit N - * Outputs: - * r8: 0 in case of fault, strlen(buffer)+1 otherwise - * - * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@xxxxxxxxxx> - */ - -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(__strnlen_user) - .prologue - alloc r2=ar.pfs,2,0,0,0 - .save ar.lc, r16 - mov r16=ar.lc // preserve ar.lc - - .body - - add r3=-1,in1 - ;; - mov ar.lc=r3 - mov r9=0 - ;; - // XXX braindead strlen loop---this needs to be optimized -.Loop1: - EXCLR(.Lexit, ld1 r8=[in0],1) - add r9=1,r9 - ;; - cmp.eq p6,p0=r8,r0 -(p6) br.cond.dpnt .Lexit - br.cloop.dptk.few .Loop1 - - add r9=1,in1 // NUL not found---return N+1 - ;; -.Lexit: - mov r8=r9 - mov ar.lc=r16 // restore ar.lc - br.ret.sptk.many rp -END(__strnlen_user) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/lib/xor.S --- a/xen/arch/ia64/linux/lib/xor.S Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,184 +0,0 @@ -/* - * arch/ia64/lib/xor.S - * - * Optimized RAID-5 checksumming functions for IA-64. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <asm/asmmacro.h> - -GLOBAL_ENTRY(xor_ia64_2) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 3, 0, 13, 16 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov ar.lc = in0 - mov pr.rot = 1 << 16 - ;; - .rotr s1[6+1], s2[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[6+1])st8.nta [r8] = d[1], 8 - nop.f 0 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_2) - -GLOBAL_ENTRY(xor_ia64_3) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 4, 0, 20, 24 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] - ;; -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], s3[6] - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_3) - -GLOBAL_ENTRY(xor_ia64_4) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 5, 0, 27, 32 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - mov r19 = in4 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[0]) ld8.nta s4[0] = [r19], 8 -(p[6]) xor r20 = s3[6], s4[6] - ;; -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], r20 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_4) - -GLOBAL_ENTRY(xor_ia64_5) - .prologue - .fframe 0 - .save ar.pfs, r31 - alloc r31 = ar.pfs, 6, 0, 34, 40 - .save ar.lc, r30 - mov r30 = ar.lc - .save pr, r29 - mov r29 = pr - ;; - .body - mov r8 = in1 - mov ar.ec = 6 + 2 - shr in0 = in0, 3 - ;; - adds in0 = -1, in0 - mov r16 = in1 - mov r17 = in2 - ;; - mov r18 = in3 - mov ar.lc = in0 - mov pr.rot = 1 << 16 - mov r19 = in4 - mov r20 = in5 - ;; - .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2] - .rotp p[6+2] -0: -(p[0]) ld8.nta s1[0] = [r16], 8 -(p[0]) ld8.nta s2[0] = [r17], 8 -(p[6]) xor d[0] = s1[6], s2[6] -(p[0]) ld8.nta s3[0] = [r18], 8 -(p[0]) ld8.nta s4[0] = [r19], 8 -(p[6]) xor r21 = s3[6], s4[6] - ;; -(p[0]) ld8.nta s5[0] = [r20], 8 -(p[6+1])st8.nta [r8] = d[1], 8 -(p[6]) xor d[0] = d[0], r21 - ;; -(p[6]) xor d[0] = d[0], s5[6] - nop.f 0 - br.ctop.dptk.few 0b - ;; - mov ar.lc = r30 - mov pr = r29, -1 - br.ret.sptk.few rp -END(xor_ia64_5) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/linux/minstate.h --- a/xen/arch/ia64/linux/minstate.h Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,254 +0,0 @@ -#include <linux/config.h> - -#include <asm/cache.h> - -#include "entry.h" - -/* - * For ivt.s we want to access the stack virtually so we don't have to disable translation - * on interrupts. - * - * On entry: - * r1: pointer to current task (ar.k6) - */ -#define MINSTATE_START_SAVE_MIN_VIRT \ -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ - ;; \ -(pUStk) mov.m r24=ar.rnat; \ -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of RBS */ \ -(pKStk) mov r1=sp; /* get sp */ \ - ;; \ -(pUStk) lfetch.fault.excl.nt1 [r22]; \ -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ - ;; \ -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ - ;; \ -(pUStk) mov r18=ar.bsp; \ -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ - -#define MINSTATE_END_SAVE_MIN_VIRT \ - bsw.1; /* switch back to bank 1 (must be last in insn group) */ \ - ;; - -/* - * For mca_asm.S we want to access the stack physically since the state is saved before we - * go virtual and don't want to destroy the iip or ipsr. - */ -#define MINSTATE_START_SAVE_MIN_PHYS \ -(pKStk) mov r3=IA64_KR(PER_CPU_DATA);; \ -(pKStk) addl r3=THIS_CPU(ia64_mca_data),r3;; \ -(pKStk) ld8 r3 = [r3];; \ -(pKStk) addl r3=IA64_MCA_CPU_INIT_STACK_OFFSET,r3;; \ -(pKStk) addl sp=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r3; \ -(pUStk) mov ar.rsc=0; /* set enforced lazy mode, pl 0, little-endian, loadrs=0 */ \ -(pUStk) addl r22=IA64_RBS_OFFSET,r1; /* compute base of register backing store */ \ - ;; \ -(pUStk) mov r24=ar.rnat; \ -(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ -(pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ -(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ - ;; \ -(pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ -(pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ - ;; \ -(pUStk) mov r18=ar.bsp; \ -(pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ - -#define MINSTATE_END_SAVE_MIN_PHYS \ - dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ - ;; - -#ifdef MINSTATE_VIRT -# define MINSTATE_GET_CURRENT(reg) \ - movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\ - ld8 reg=[reg] -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT -#endif - -#ifdef MINSTATE_PHYS -# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; tpa reg=reg -# define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS -# define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS -#endif - -/* - * DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves - * the minimum state necessary that allows us to turn psr.ic back - * on. - * - * Assumed state upon entry: - * psr.ic: off - * r31: contains saved predicates (pr) - * - * Upon exit, the state is as follows: - * psr.ic: off - * r2 = points to &pt_regs.r16 - * r8 = contents of ar.ccv - * r9 = contents of ar.csd - * r10 = contents of ar.ssd - * r11 = FPSR_DEFAULT - * r12 = kernel sp (kernel virtual address) - * r13 = points to current task_struct (kernel virtual address) - * p15 = TRUE if psr.i is set in cr.ipsr - * predicate registers (other than p2, p3, and p15), b6, r3, r14, r15: - * preserved - * - * Note that psr.ic is NOT turned on by this macro. This is so that - * we can pass interruption state as arguments to a handler. - */ -#define DO_SAVE_MIN(COVER,SAVE_IFS,EXTRA) \ - MINSTATE_GET_CURRENT(r16); /* M (or M;;I) */ \ - mov r27=ar.rsc; /* M */ \ - mov r20=r1; /* A */ \ - mov r25=ar.unat; /* M */ \ - mov r29=cr.ipsr; /* M */ \ - mov r26=ar.pfs; /* I */ \ - mov r28=cr.iip; /* M */ \ - mov r21=ar.fpsr; /* M */ \ - COVER; /* B;; (or nothing) */ \ - ;; \ - adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16; \ - ;; \ - ld1 r17=[r16]; /* load current->thread.on_ustack flag */ \ - st1 [r16]=r0; /* clear current->thread.on_ustack flag */ \ - adds r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 \ - /* switch from user to kernel RBS: */ \ - ;; \ - invala; /* M */ \ - SAVE_IFS; \ - cmp.eq pKStk,pUStk=r0,r17; /* are we in kernel mode already? */ \ - ;; \ - MINSTATE_START_SAVE_MIN \ - adds r17=2*L1_CACHE_BYTES,r1; /* really: biggest cache-line size */ \ - adds r16=PT(CR_IPSR),r1; \ - ;; \ - lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES; \ - st8 [r16]=r29; /* save cr.ipsr */ \ - ;; \ - lfetch.fault.excl.nt1 [r17]; \ - tbit.nz p15,p0=r29,IA64_PSR_I_BIT; \ - mov r29=b0 \ - ;; \ - adds r16=PT(R8),r1; /* initialize first base pointer */ \ - adds r17=PT(R9),r1; /* initialize second base pointer */ \ -(pKStk) mov r18=r0; /* make sure r18 isn't NaT */ \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r8,16; \ -.mem.offset 8,0; st8.spill [r17]=r9,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r10,24; \ -.mem.offset 8,0; st8.spill [r17]=r11,24; \ - ;; \ - st8 [r16]=r28,16; /* save cr.iip */ \ - st8 [r17]=r30,16; /* save cr.ifs */ \ -(pUStk) sub r18=r18,r22; /* r18=RSE.ndirty*8 */ \ - mov r8=ar.ccv; \ - mov r9=ar.csd; \ - mov r10=ar.ssd; \ - movl r11=FPSR_DEFAULT; /* L-unit */ \ - ;; \ - st8 [r16]=r25,16; /* save ar.unat */ \ - st8 [r17]=r26,16; /* save ar.pfs */ \ - shl r18=r18,16; /* compute ar.rsc to be used for "loadrs" */ \ - ;; \ - st8 [r16]=r27,16; /* save ar.rsc */ \ -(pUStk) st8 [r17]=r24,16; /* save ar.rnat */ \ -(pKStk) adds r17=16,r17; /* skip over ar_rnat field */ \ - ;; /* avoid RAW on r16 & r17 */ \ -(pUStk) st8 [r16]=r23,16; /* save ar.bspstore */ \ - st8 [r17]=r31,16; /* save predicates */ \ -(pKStk) adds r16=16,r16; /* skip over ar_bspstore field */ \ - ;; \ - st8 [r16]=r29,16; /* save b0 */ \ - st8 [r17]=r18,16; /* save ar.rsc value for "loadrs" */ \ - cmp.eq pNonSys,pSys=r0,r0 /* initialize pSys=0, pNonSys=1 */ \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r20,16; /* save original r1 */ \ -.mem.offset 8,0; st8.spill [r17]=r12,16; \ - adds r12=-16,r1; /* switch to kernel memory stack (with 16 bytes of scratch) */ \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r13,16; \ -.mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ - movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; \ - ld8 r13=[r13]; /* establish 'current' */ \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r15,16; \ -.mem.offset 8,0; st8.spill [r17]=r14,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r16]=r2,16; \ -.mem.offset 8,0; st8.spill [r17]=r3,16; \ - adds r2=IA64_PT_REGS_R16_OFFSET,r1; \ - ;; \ - EXTRA; \ - movl r1=__gp; /* establish kernel global pointer */ \ - ;; \ - MINSTATE_END_SAVE_MIN - -/* - * SAVE_REST saves the remainder of pt_regs (with psr.ic on). - * - * Assumed state upon entry: - * psr.ic: on - * r2: points to &pt_regs.r16 - * r3: points to &pt_regs.r17 - * r8: contents of ar.ccv - * r9: contents of ar.csd - * r10: contents of ar.ssd - * r11: FPSR_DEFAULT - * - * Registers r14 and r15 are guaranteed not to be touched by SAVE_REST. - */ -#define SAVE_REST \ -.mem.offset 0,0; st8.spill [r2]=r16,16; \ -.mem.offset 8,0; st8.spill [r3]=r17,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r18,16; \ -.mem.offset 8,0; st8.spill [r3]=r19,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r20,16; \ -.mem.offset 8,0; st8.spill [r3]=r21,16; \ - mov r18=b6; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r22,16; \ -.mem.offset 8,0; st8.spill [r3]=r23,16; \ - mov r19=b7; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r24,16; \ -.mem.offset 8,0; st8.spill [r3]=r25,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r26,16; \ -.mem.offset 8,0; st8.spill [r3]=r27,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r28,16; \ -.mem.offset 8,0; st8.spill [r3]=r29,16; \ - ;; \ -.mem.offset 0,0; st8.spill [r2]=r30,16; \ -.mem.offset 8,0; st8.spill [r3]=r31,32; \ - ;; \ - mov ar.fpsr=r11; /* M-unit */ \ - st8 [r2]=r8,8; /* ar.ccv */ \ - adds r24=PT(B6)-PT(F7),r3; \ - ;; \ - stf.spill [r2]=f6,32; \ - stf.spill [r3]=f7,32; \ - ;; \ - stf.spill [r2]=f8,32; \ - stf.spill [r3]=f9,32; \ - ;; \ - stf.spill [r2]=f10; \ - stf.spill [r3]=f11; \ - adds r25=PT(B7)-PT(F11),r3; \ - ;; \ - st8 [r24]=r18,16; /* b6 */ \ - st8 [r25]=r19,16; /* b7 */ \ - ;; \ - st8 [r24]=r9; /* ar.csd */ \ - st8 [r25]=r10; /* ar.ssd */ \ - ;; - -#define SAVE_MIN_WITH_COVER DO_SAVE_MIN(cover, mov r30=cr.ifs,) -#define SAVE_MIN_WITH_COVER_R19 DO_SAVE_MIN(cover, mov r30=cr.ifs, mov r15=r19) -#define SAVE_MIN DO_SAVE_MIN( , mov r30=r0, ) diff -r 44316ce83277 -r b7276814008c xen/arch/ia64/pdb-stub.c --- a/xen/arch/ia64/pdb-stub.c Tue Aug 30 23:51:51 2005 +++ /dev/null Wed Aug 31 20:32:27 2005 @@ -1,59 +0,0 @@ - -/* - * pervasive debugger - * www.cl.cam.ac.uk/netos/pdb - * - * alex ho - * 2004 - * university of cambridge computer laboratory - * - * code adapted originally from kgdb, nemesis, & gdbserver - */ - -#include <xen/lib.h> -#include <xen/sched.h> -#include <asm/ptrace.h> -#include <xen/keyhandler.h> -#include <asm/processor.h> -#include <asm/pdb.h> -#include <xen/list.h> -#include <xen/serial.h> - -#define __PDB_GET_VAL 1 -#define __PDB_SET_VAL 2 - -/* - * Read or write memory in an address space - */ -int pdb_change_values(u_char *buffer, int length, - unsigned long cr3, unsigned long addr, int rw) -{ - dummy(); - return 0; -} - -/* - * Set memory in a domain's address space - * Set "length" bytes at "address" from "domain" to the values in "buffer". - * Return the number of bytes set, 0 if there was a problem. - */ - -int pdb_set_values(u_char *buffer, int length, - unsigned long cr3, unsigned long addr) -{ - int count = pdb_change_values(buffer, length, cr3, addr, __PDB_SET_VAL); - return count; -} - -/* - * Read memory from a domain's address space. - * Fetch "length" bytes at "address" from "domain" into "buffer". - * Return the number of bytes read, 0 if there was a problem. - */ - -int pdb_get_values(u_char *buffer, int length, - unsigned long cr3, unsigned long addr) -{ - return pdb_change_values(buffer, length, cr3, addr, __PDB_GET_VAL); -} - _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |