diff --git a/asm-opt.c b/asm-opt.c
index 6fa5404..fd885a5 100644
--- a/asm-opt.c
+++ b/asm-opt.c
@@ -144,6 +144,15 @@ static bench_info x86_sse2[] =
     { NULL, 0, NULL }
 };
 
+static bench_info x86_sse2_fb[] = /* returned by get_asm_framebuffer_benchmarks() when check_sse2_support() is nonzero */
+{ /* each row: label, integer flag, copy routine; the flag is 1 only on the "2-pass" rows (presumably a two-pass switch - confirm against bench_info) */
+    { "MOVSD copy (from framebuffer)", 0, aligned_block_copy_movsd },
+    { "MOVSD 2-pass copy (from framebuffer)", 1, aligned_block_copy_movsd },
+    { "SSE2 copy (from framebuffer)", 0, aligned_block_copy_sse2 },
+    { "SSE2 2-pass copy (from framebuffer)", 1, aligned_block_copy_sse2 },
+    { NULL, 0, NULL } /* all-NULL last row; presumably the end-of-table marker */
+};
+
 static int check_sse2_support(void)
 {
 #ifdef __amd64__
@@ -185,6 +194,14 @@ bench_info *get_asm_benchmarks(void)
         return empty;
 }
 
+/* x86 branch: x86_sse2_fb when check_sse2_support() is nonzero, else empty. */
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    int has_sse2 = check_sse2_support();
+
+    return has_sse2 ? x86_sse2_fb : empty;
+}
+
 #elif defined(__arm__)
 
 #include "arm-neon.h"
@@ -271,6 +288,48 @@ bench_info *get_asm_benchmarks(void)
         return arm_v4;
 }
 
+static bench_info arm_neon_fb[] = /* returned by get_asm_framebuffer_benchmarks() when check_cpu_feature("neon") succeeds */
+{ /* each row: label, integer flag, routine; the flag is 1 only on the "2-pass" rows (presumably a two-pass switch - confirm against bench_info) */
+    { "NEON read (from framebuffer)", 0, aligned_block_read_neon },
+    { "NEON copy (from framebuffer)", 0, aligned_block_copy_neon },
+    { "NEON 2-pass copy (from framebuffer)", 1, aligned_block_copy_neon },
+    { "NEON unrolled copy (from framebuffer)", 0, aligned_block_copy_unrolled_neon },
+    { "NEON 2-pass unrolled copy (from framebuffer)", 1, aligned_block_copy_unrolled_neon },
+    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp }, /* same VFP rows as arm_v5te_vfp_fb */
+    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te }, /* same ARM rows as arm_v5te_fb */
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL } /* all-NULL last row; presumably the end-of-table marker */
+};
+
+static bench_info arm_v5te_vfp_fb[] = /* returned when "neon" is absent but both "edsp" and "vfp" are reported by check_cpu_feature() */
+{ /* each row: label, integer flag, routine; the flag is 1 only on the "2-pass" rows */
+    { "VFP copy (from framebuffer)", 0, aligned_block_copy_vfp },
+    { "VFP 2-pass copy (from framebuffer)", 1, aligned_block_copy_vfp },
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL } /* all-NULL last row; presumably the end-of-table marker */
+};
+
+static bench_info arm_v5te_fb[] = /* returned when only "edsp" (no "neon", no "edsp"+"vfp" pair) is reported by check_cpu_feature() */
+{ /* each row: label, integer flag, routine; the flag is 1 only on the "2-pass" row */
+    { "ARM copy (from framebuffer)", 0, aligned_block_copy_incr_armv5te },
+    { "ARM 2-pass copy (from framebuffer)", 1, aligned_block_copy_incr_armv5te },
+    { NULL, 0, NULL } /* all-NULL last row; presumably the end-of-table marker */
+};
+
+/* ARM branch: probe "neon", then "edsp"+"vfp", then "edsp"; else empty. */
+bench_info *get_asm_framebuffer_benchmarks(void)
+{
+    if (check_cpu_feature("neon"))
+        return arm_neon_fb;
+    if (check_cpu_feature("edsp") && check_cpu_feature("vfp"))
+        return arm_v5te_vfp_fb;
+    if (check_cpu_feature("edsp"))
+        return arm_v5te_fb;
+    return empty;
+}
+
 #elif defined(__mips__) && defined(_ABIO32)
 
 #include "mips-32.h"
@@ -307,6 +366,11 @@ bench_info *get_asm_benchmarks(void)
     }
 }
 
+bench_info *get_asm_framebuffer_benchmarks(void) /* variant compiled under __mips__ && _ABIO32 */
+{
+    return empty; /* this branch unconditionally hands back the shared `empty` table */
+}
+
 #else
 
 bench_info *get_asm_benchmarks(void)
@@ -314,4 +378,9 @@ bench_info *get_asm_benchmarks(void)
     return empty;
 }
 
+bench_info *get_asm_framebuffer_benchmarks(void) /* variant for the final #else branch (no earlier target test matched) */
+{
+    return empty; /* this branch unconditionally hands back the shared `empty` table */
+}
+
 #endif
diff --git a/asm-opt.h b/asm-opt.h
index 722fa00..caf751b 100644
--- a/asm-opt.h
+++ b/asm-opt.h
@@ -34,5 +34,6 @@ typedef struct
 } bench_info;
 
 bench_info *get_asm_benchmarks(void);
+bench_info *get_asm_framebuffer_benchmarks(void); /* table of benchmarks that main() runs with the mapped framebuffer as the source buffer */
 
 #endif
diff --git a/main.c b/main.c
index 80066a1..53f4c7e 100644
--- a/main.c
+++ b/main.c
@@ -43,7 +43,7 @@
 #define BLOCKSIZE        2048
 #define MAXREPEATS       10
 
-#ifdef BENCH_FRAMEBUFFER
+#ifdef __linux__
 static void *mmap_framebuffer(size_t *fbsize)
 {
     int fd;
@@ -475,7 +475,7 @@ int main(void)
     int64_t *srcbuf, *dstbuf, *tmpbuf;
     void *poolbuf;
     size_t bufsize = SIZE;
-#ifdef BENCH_FRAMEBUFFER
+#ifdef __linux__
     size_t fbsize;
     int64_t *fbbuf = mmap_framebuffer(&fbsize);
     fbsize = (fbsize / BLOCKSIZE) * BLOCKSIZE;
@@ -488,16 +488,6 @@ int main(void)
                                             (void **)&dstbuf, bufsize,
                                             (void **)&tmpbuf, BLOCKSIZE,
                                             NULL, 0);
-#ifdef BENCH_FRAMEBUFFER
-    if (fbbuf)
-    {
-        printf("(*) using framebuffer as the source buffer (size=%d)\n", (int)fbsize);
-        srcbuf = fbbuf;
-        if (bufsize > fbsize)
-            bufsize = fbsize;
-    }
-#endif
-
     printf("\n");
     printf("==========================================================================\n");
     printf("== Memory bandwidth tests                                               ==\n");
@@ -521,6 +511,39 @@ int main(void)
         printf(" ---\n");
         bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi);
     }
+
+#ifdef __linux__
+    bi = get_asm_framebuffer_benchmarks();
+    if (bi->f && fbbuf)
+    {
+        printf("\n");
+        printf("==========================================================================\n");
+        printf("== Framebuffer read tests.                                              ==\n");
+        printf("==                                                                      ==\n");
+        printf("== Many ARM devices use a part of the system memory as the framebuffer, ==\n");
+        printf("== typically mapped as uncached but with write-combining enabled.       ==\n");
+        printf("== Writes to such framebuffers are quite fast, but reads are much       ==\n");
+        printf("== slower and very sensitive to the alignment and the selection of      ==\n");
+        printf("== CPU instructions which are used for accessing memory.                ==\n");
+        printf("==                                                                      ==\n");
+        printf("== Many x86 systems allocate the framebuffer in the GPU memory,         ==\n");
+        printf("== accessible for the CPU via a relatively slow PCI-E bus. Moreover,    ==\n");
+        printf("== PCI-E is asymmetric and handles reads a lot worse than writes.       ==\n");
+        printf("==                                                                      ==\n");
+        printf("== If uncached framebuffer reads are reasonably fast (at least 100 MB/s ==\n");
+        printf("== or preferably >300 MB/s), then using the shadow framebuffer layer    ==\n");
+        printf("== is not necessary in Xorg DDX drivers, resulting in a nice overall    ==\n");
+        printf("== performance improvement. For example, the xf86-video-fbturbo DDX     ==\n");
+        printf("== uses this trick.                                                     ==\n");
+        printf("==========================================================================\n\n");
+
+        srcbuf = fbbuf;
+        if (bufsize > fbsize)
+            bufsize = fbsize;
+        bandwidth_bench(dstbuf, srcbuf, tmpbuf, bufsize, BLOCKSIZE, " ", bi);
+    }
+#endif
+
     free(poolbuf);
 
     printf("\n");