[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [XenPPC] copy_page speedup using dcbz on target
/*
 * [XenPPC] copy_page speedup using dcbz on target
 *
 * Using dcbz avoids first reading a cache line from memory before writing
 * to the line.
 *
 * Timing results (starting with clean cache, ie no write-backs for dirty
 * lines):
 *
 *   JS20:
 *     elapsed time:            0x0000000000009f5e
 *     elapsed time using dcbz: 0x000000000000569e
 *     elapsed time:            0x0000000000009fe9
 *     elapsed time using dcbz: 0x0000000000005765
 *
 *   JS21:
 *     elapsed time:            0x000000000000089e
 *     elapsed time using dcbz: 0x0000000000000439
 *     elapsed time:            0x0000000000000886
 *     elapsed time using dcbz: 0x0000000000000438
 *
 * The program below times 64 page copies done with a plain ld/std loop,
 * then the same 64 copies with each destination page first zeroed by dcbz.
 * Elapsed times are printed as raw time-base ticks (mftb).
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

typedef unsigned char uchar;
typedef unsigned long ulong;

#define LINE_SIZE 128
#define PAGE_SIZE 0x1000
#define BUF1_SIZE (PAGE_SIZE * 64)
#define BUF2_SIZE (PAGE_SIZE)
#define BUF3_SIZE (0x800000)

static __inline__ ulong time_base(void);
static __inline__ void copy_page(void *dp, void *sp);
static __inline__ void cacheable_copy_page(void *dp, void *sp);
static __inline__ void cacheable_clear_page(void *addr);
static uchar clean_cache(uchar *buf3);

/* Round p up to the next PAGE_SIZE boundary (p itself if already aligned). */
static uchar *page_align(uchar *p)
{
    return (uchar *)(((ulong)p + (PAGE_SIZE - 1)) & ~((ulong)PAGE_SIZE - 1));
}

/*
 * Allocates three page-aligned buffers:
 *   buf1 - 64 destination pages,
 *   buf2 - one source page,
 *   buf3 - a large buffer that clean_cache() walks before each timed run.
 * Returns 0 on success, 1 if an allocation fails.
 */
int main(int argc, char **argv)
{
    const ulong npages = BUF1_SIZE / PAGE_SIZE;
    uchar *raw1, *raw2, *raw3;      /* as returned by malloc, kept for free() */
    uchar *buf1, *buf2, *buf3;      /* page-aligned views of the above */
    ulong tb1, tb2, i;

    (void)argc;
    (void)argv;

    /* One extra page per buffer leaves room to round the start up. */
    raw1 = malloc(BUF1_SIZE + PAGE_SIZE);
    raw2 = malloc(BUF2_SIZE + PAGE_SIZE);
    raw3 = malloc(BUF3_SIZE + PAGE_SIZE);
    if (raw1 == NULL || raw2 == NULL || raw3 == NULL) {
        perror("malloc");
        free(raw1);
        free(raw2);
        free(raw3);
        return 1;
    }

    buf1 = page_align(raw1);
    buf2 = page_align(raw2);
    buf3 = page_align(raw3);

    memset(buf1, 1, BUF1_SIZE);
    memset(buf2, 2, BUF2_SIZE);
    memset(buf3, 3, BUF3_SIZE);

    /* Run 1: plain copy. */
    (void)clean_cache(buf3);
    tb1 = time_base();
    for (i = 0; i < npages; i++) {
        copy_page(buf1 + i * PAGE_SIZE, buf2);
    }
    tb2 = time_base();
    printf("elapsed time: 0x%016lx\n", tb2 - tb1);

    /* Run 2: dcbz the destination page, then copy. */
    (void)clean_cache(buf3);
    tb1 = time_base();
    for (i = 0; i < npages; i++) {
        cacheable_copy_page(buf1 + i * PAGE_SIZE, buf2);
    }
    tb2 = time_base();
    printf("elapsed time using dcbz: 0x%016lx\n", tb2 - tb1);

    free(raw1);
    free(raw2);
    free(raw3);
    return 0;
}

/* Returns the current value of the PowerPC time base register. */
static __inline__ ulong time_base(void)
{
    ulong tb;

    __asm__ __volatile__("mftb %0" : "=r" (tb));
    return tb;
}

/*
 * Zeroes one page at addr by issuing dcbz on every LINE_SIZE-byte line.
 * addr is advanced inside the asm, so it is declared read-write ("+r");
 * only the local copy of the pointer changes.
 */
static __inline__ void cacheable_clear_page(void *addr)
{
    const ulong line_size = LINE_SIZE;
    const ulong lines = PAGE_SIZE / LINE_SIZE;

    __asm__ __volatile__(
        "mtctr  %1\n"
        "1:\tdcbz   0,%0\n\t"
        "add    %0,%0,%2\n\t"
        "bdnz   1b"
        : "+r" (addr)
        : "r" (lines), "r" (line_size)
        : "ctr", "memory");
}

/*
 * Copies one page from sp to dp, 8 bytes at a time.
 *
 * The first doubleword is moved with ld/std; the remaining
 * (PAGE_SIZE / 8) - 1 are moved with the update forms ldu/stdu, which
 * advance the base registers as they go.
 *
 * Constraint notes:
 *  - dp and sp are modified by ldu/stdu, so they must be read-write
 *    operands, not plain inputs.
 *  - They use "b" (base register, never r0): r0 as a base reads as literal
 *    zero in ld/std and is an invalid form for ldu/stdu.
 *  - The scratch register is a separate early-clobber output so it cannot
 *    share a register with the loop count.
 */
static __inline__ void copy_page(void *dp, void *sp)
{
    const ulong dword_size = 8;
    const ulong dwords = (PAGE_SIZE / dword_size) - 1;
    ulong scratch;

    __asm__ __volatile__(
        "mtctr  %3\n\t"
        "ld     %2,0(%1)\n\t"
        "std    %2,0(%0)\n"
        "1:\tldu    %2,8(%1)\n\t"
        "stdu   %2,8(%0)\n\t"
        "bdnz   1b"
        : "+b" (dp), "+b" (sp), "=&r" (scratch)
        : "r" (dwords)
        : "ctr", "memory");
}

/* Copies one page from sp to dp after first zeroing dp's lines with dcbz. */
static __inline__ void cacheable_copy_page(void *dp, void *sp)
{
    cacheable_clear_page(dp);
    copy_page(dp, sp);
}

/*
 * Reads one byte from every LINE_SIZE-byte line of the BUF3_SIZE-byte
 * buffer at buf3 and returns the (wrapping) sum of those bytes.
 *
 * The loads go through a volatile pointer so the compiler cannot discard
 * the walk when the caller ignores the result; the walk exists for its
 * effect on the cache, not for the sum.
 */
static uchar clean_cache(uchar *buf3)
{
    const volatile uchar *line = buf3;
    uchar sum = 0;
    ulong i;

    for (i = 0; i < BUF3_SIZE / LINE_SIZE; i++) {
        sum += *line;
        line += LINE_SIZE;
    }
    return sum;
}

/*
 * _______________________________________________
 * Xen-ppc-devel mailing list
 * Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
 * http://lists.xensource.com/xen-ppc-devel
 */
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |