Add a dedicated "framebuffer read" benchmark section.

asm-opt.c, asm-opt.h:
  Introduce get_asm_framebuffer_benchmarks(), which returns a bench_info
  table terminated by a { NULL, 0, NULL } entry:
    - SSE2 branch: MOVSD and SSE2 copies when check_sse2_support() is true,
      otherwise the `empty` table.
    - __arm__ branch: NEON, VFP+ARMv5TE or ARMv5TE-only tables, selected
      with check_cpu_feature("neon" / "edsp" / "vfp"), otherwise `empty`.
    - __mips__ && _ABIO32 branch and the final #else branch: `empty`.

main.c:
  - Guard the framebuffer mapping code with __linux__ instead of
    BENCH_FRAMEBUFFER.
  - Stop replacing srcbuf with the framebuffer before the regular memory
    bandwidth tests.
  - After the regular tests, run the framebuffer table in its own section,
    but only when the table's first entry has a function and fbbuf is
    non-NULL.

NOTE(review): the line structure of this patch was reconstructed; indentation
and the position of blank context lines were chosen to satisfy the hunk line
counts. Runs of spaces inside lines (for example any padding inside the printf
banner strings) are kept exactly as found and may differ from the target
tree, so a whitespace-tolerant apply may be required -- confirm against the
original commit.

diff --git a/asm-opt.c b/asm-opt.c
index 6fa5404..fd885a5 100644
--- a/asm-opt.c
+++ b/asm-opt.c
@@ -144,6 +144,15 @@ static bench_info x86_sse2[] =
     { NULL, 0, NULL }
 };
 
+static bench_info x86_sse2_fb[] =
+{
+    { "MOVSD copy (from framebuffer)", 0, aligned_block_copy_movsd },
+    { "MOVSD 2-pass copy (from framebuffer)", 1, aligned_block_copy_movsd },
+    { "SSE2 copy (from framebuffer)", 0, aligned_block_copy_sse2 },
+    { "SSE2 2-pass copy (from framebuffer)", 1, aligned_block_copy_sse2 },
+    { NULL, 0, NULL }
+};
+
 static int check_sse2_support(void)
 {
 #ifdef __amd64__
@@ -185,6 +194,14 @@ bench_info *get_asm_benchmarks(void)
         return empty;
 }
 
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    if (check_sse2_support())
+        return x86_sse2_fb;
+    else
+        return empty;
+}
+
 #elif defined(__arm__)
 
 #include "arm-neon.h"
@@ -271,6 +288,48 @@ bench_info *get_asm_benchmarks(void)
         return arm_v4;
 }
 
+static bench_info arm_neon_fb[] =
+{
+    { "NEON read (from framebuffer)", 0, aligned_block_read_neon },
+    { "NEON copy (from framebuffer)", 0, aligned_block_copy_neon },
+    { "NEON 2-pass copy (from framebuffer)", 1, aligned_block_copy_neon },
+    { "NEON unrolled copy (from framebuffer)", 0, aligned_block_copy_unrolled_neon },
+    { "NEON 2-pass unrolled copy (from framebuffer)", 1, aligned_block_copy_unrolled_neon },
+    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp },
+    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL }
+};
+
+static bench_info arm_v5te_vfp_fb[] =
+{
+    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp },
+    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL }
+};
+
+static bench_info arm_v5te_fb[] =
+{
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL }
+};
+
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    if (check_cpu_feature("neon"))
+        return arm_neon_fb;
+    else if (check_cpu_feature("edsp") && check_cpu_feature("vfp"))
+        return arm_v5te_vfp_fb;
+    else if (check_cpu_feature("edsp"))
+        return arm_v5te_fb;
+    else
+        return empty;
+}
+
 #elif defined(__mips__) && defined(_ABIO32)
 
 #include "mips-32.h"
@@ -307,6 +366,11 @@ bench_info *get_asm_benchmarks(void)
     }
 }
 
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    return empty;
+}
+
 #else
 
 bench_info *get_asm_benchmarks(void)
@@ -314,4 +378,9 @@ bench_info *get_asm_benchmarks(void)
     return empty;
 }
 
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    return empty;
+}
+
 #endif
diff --git a/asm-opt.h b/asm-opt.h
index 722fa00..caf751b 100644
--- a/asm-opt.h
+++ b/asm-opt.h
@@ -34,5 +34,6 @@ typedef struct
 } bench_info;
 
 bench_info *get_asm_benchmarks(void);
+bench_info *get_asm_framebuffer_benchmarks(void);
 
 #endif
diff --git a/main.c b/main.c
index 80066a1..53f4c7e 100644
--- a/main.c
+++ b/main.c
@@ -43,7 +43,7 @@
 #define BLOCKSIZE 2048
 #define MAXREPEATS 10
 
-#ifdef BENCH_FRAMEBUFFER
+#ifdef __linux__
 static void *mmap_framebuffer(size_t *fbsize)
 {
     int fd;
@@ -475,7 +475,7 @@ int main(void)
     int64_t *srcbuf, *dstbuf, *tmpbuf;
     void *poolbuf;
     size_t bufsize = SIZE;
-#ifdef BENCH_FRAMEBUFFER
+#ifdef __linux__
     size_t fbsize;
     int64_t *fbbuf = mmap_framebuffer(&fbsize);
     fbsize = (fbsize / BLOCKSIZE) * BLOCKSIZE;
@@ -488,16 +488,6 @@ int main(void)
                                             (void **)&dstbuf, bufsize,
                                             (void **)&tmpbuf, BLOCKSIZE,
                                             NULL, 0);
-#ifdef BENCH_FRAMEBUFFER
-    if (fbbuf)
-    {
-        printf("(*) using framebuffer as the source buffer (size=%d)\n", (int)fbsize);
-        srcbuf = fbbuf;
-        if (bufsize > fbsize)
-            bufsize = fbsize;
-    }
-#endif
-
     printf("\n");
     printf("==========================================================================\n");
     printf("== Memory bandwidth tests ==\n");
@@ -521,6 +511,39 @@ int main(void)
         printf(" ---\n");
         bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi);
     }
+
+#ifdef __linux__
+    bi = get_asm_framebuffer_benchmarks();
+    if (bi->f && fbbuf)
+    {
+        printf("\n");
+        printf("==========================================================================\n");
+        printf("== Framebuffer read tests. ==\n");
+        printf("== ==\n");
+        printf("== Many ARM devices use a part of the system memory as the framebuffer, ==\n");
+        printf("== typically mapped as uncached but with write-combining enabled. ==\n");
+        printf("== Writes to such framebuffers are quite fast, but reads are much ==\n");
+        printf("== slower and very sensitive to the alignment and the selection of ==\n");
+        printf("== CPU instructions which are used for accessing memory. ==\n");
+        printf("== ==\n");
+        printf("== Many x86 systems allocate the framebuffer in the GPU memory, ==\n");
+        printf("== accessible for the CPU via a relatively slow PCI-E bus. Moreover, ==\n");
+        printf("== PCI-E is asymmetric and handles reads a lot worse than writes. ==\n");
+        printf("== ==\n");
+        printf("== If uncached framebuffer reads are reasonably fast (at least 100 MB/s ==\n");
+        printf("== or preferably >300 MB/s), then using the shadow framebuffer layer ==\n");
+        printf("== is not necessary in Xorg DDX drivers, resulting in a nice overall ==\n");
+        printf("== performance improvement. For example, the xf86-video-fbturbo DDX ==\n");
+        printf("== uses this trick. ==\n");
+        printf("==========================================================================\n\n");
+
+        srcbuf = fbbuf;
+        if (bufsize > fbsize)
+            bufsize = fbsize;
+        bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi);
+    }
+#endif
+
     free(poolbuf);
 
     printf("\n");