From 497a59aa31a9dd5c5e6167dba3e9ff69d6afc8d0 Mon Sep 17 00:00:00 2001 From: Thanassis Tsiodras Date: Thu, 14 Jul 2022 21:59:16 +0200 Subject: [PATCH] Reintroduce SSE fallback, for my poor Atom x5-Z8350. All hail the Atomic PI! --- README | 17 +++-- README.md | 17 +++-- src/mandel.cc | 34 ++++++++-- src/sse.cc | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/sse.h | 1 + src/xaos.cc | 2 +- 6 files changed, 229 insertions(+), 19 deletions(-) diff --git a/README b/README index 76d3389..f89b89b 100644 --- a/README +++ b/README @@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN Windows ------- Windows users can download and run a pre-compiled Windows binary -[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip). +[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip). After decompressing, you can simply execute either one of the two .bat files. The 'autopilot' one zooms in a specific location, while the other @@ -32,12 +32,15 @@ You can then simply... 
$ src/mandelSSE -h Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT] Where: - -h Show this help message - -m Run in mouse-driven mode - -a Run in autopilot mode (default) - -b Run in benchmark mode (implies autopilot) - -f fps Enforce upper bound of frames per second (default: 60) - (use 0 to run at full possible speed) + -h Show this help message + -m Run in mouse-driven mode + -a Run in autopilot mode (default) + -b Run in benchmark mode (implies autopilot) + -v Force use of AVX + -s Force use of SSE + -d Force use of non-AVX, non-SSE code + -f fps Enforce upper bound of frames per second (default: 60) + (use 0 to run at full possible speed) If WIDTH and HEIGHT are not provided, they default to: 1024 768 diff --git a/README.md b/README.md index 76d3389..f89b89b 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN Windows ------- Windows users can download and run a pre-compiled Windows binary -[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip). +[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip). After decompressing, you can simply execute either one of the two .bat files. The 'autopilot' one zooms in a specific location, while the other @@ -32,12 +32,15 @@ You can then simply... 
$ src/mandelSSE -h Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT] Where: - -h Show this help message - -m Run in mouse-driven mode - -a Run in autopilot mode (default) - -b Run in benchmark mode (implies autopilot) - -f fps Enforce upper bound of frames per second (default: 60) - (use 0 to run at full possible speed) + -h Show this help message + -m Run in mouse-driven mode + -a Run in autopilot mode (default) + -b Run in benchmark mode (implies autopilot) + -v Force use of AVX + -s Force use of SSE + -d Force use of non-AVX, non-SSE code + -f fps Enforce upper bound of frames per second (default: 60) + (use 0 to run at full possible speed) If WIDTH and HEIGHT are not provided, they default to: 1024 768 diff --git a/src/mandel.cc b/src/mandel.cc index 90e60a1..089e703 100644 --- a/src/mandel.cc +++ b/src/mandel.cc @@ -26,12 +26,15 @@ void usage(char *argv[]) { - printf("Usage: %s [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT]\n", argv[0]); + printf("Usage: %s [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]\n", argv[0]); puts("Where:"); puts("\t-h\tShow this help message"); puts("\t-m\tRun in mouse-driven mode"); puts("\t-a\tRun in autopilot mode (default)"); puts("\t-b\tRun in benchmark mode (implies autopilot)"); + puts("\t-v\tForce use of AVX"); + puts("\t-s\tForce use of SSE"); + puts("\t-d\tForce use of non-AVX, non-SSE code"); puts("\t-f fps\tEnforce upper bound of frames per second (default: 60)"); puts("\t \t(use 0 to run at full possible speed)\n"); puts("If WIDTH and HEIGHT are not provided, they default to: 1024 768"); @@ -42,8 +45,9 @@ int main(int argc, char *argv[]) { int opt, fps = 60; bool autoPilot = true, benchmark = false; + bool forceAVX = false, forceSSE = false, forceDefault = false; - while ((opt = getopt(argc, argv, "hmabf:")) != -1) { + while ((opt = getopt(argc, argv, "hmabvsdf:")) != -1) { switch (opt) { case 'h': usage(argv); @@ -58,6 +62,15 @@ int main(int argc, char *argv[]) autoPilot = true; benchmark = true; 
break; + case 'v': + forceAVX = true; + break; + case 's': + forceSSE = true; + break; + case 'd': + forceDefault = true; + break; case 'f': if (1 != sscanf(optarg, "%d", &fps)) panic("[x] Not a valid frame rate: '%s'", optarg); @@ -112,8 +125,21 @@ int main(int argc, char *argv[]) else printf("[-] FPS Limit: %d frames/sec\n", fps); #ifdef __x86_64__ - CoreLoopDouble = __builtin_cpu_supports("avx") ? CoreLoopDoubleAVX : CoreLoopDoubleDefault; - printf("[-] Mode: %s\n", __builtin_cpu_supports("avx") ? "AVX" : "non-AVX"); + if (forceAVX) + CoreLoopDouble = CoreLoopDoubleAVX; + else if (forceSSE) + CoreLoopDouble = CoreLoopDoubleSSE; + else if (forceDefault) + CoreLoopDouble = CoreLoopDoubleDefault; + else + CoreLoopDouble = + __builtin_cpu_supports("avx") ? CoreLoopDoubleAVX + : __builtin_cpu_supports("sse") ? CoreLoopDoubleSSE + : CoreLoopDoubleDefault; + printf("[-] Mode: %s\n", + CoreLoopDouble == CoreLoopDoubleAVX ? "AVX" + : CoreLoopDouble == CoreLoopDoubleSSE ? "SSE" + : "non-AVX/non-SSE"); #else CoreLoopDouble = CoreLoopDoubleDefault; printf("[-] Mode: %s\n", "non-AVX"); diff --git a/src/sse.cc b/src/sse.cc index 6101f84..40419a1 100644 --- a/src/sse.cc +++ b/src/sse.cc @@ -91,6 +91,183 @@ void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char #ifdef __x86_64__ +void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p) /* Renders four horizontally adjacent points (real parts xcur, xcur+xstep, xcur+2*xstep, xcur+3*xstep; imaginary part ycur for all) by running the same two-lane packed-double asm loop twice, and stores one byte per point through *p, advancing *p by four in total. A lane is stored as 0 if it stays below the bailout for all ITERA iterations, or if the periodicity check fires (which zeroes both lanes of that pass); otherwise its counter is converted to int and stored as unsigned char. fours, ones, allbits, ITERA and DECLARE_ALIGNED are defined outside this hunk. */ +{ + DECLARE_ALIGNED(16,double,re[2]); + DECLARE_ALIGNED(16,double,im[2]); + DECLARE_ALIGNED(16,unsigned,k1[2]); /* NOTE(review): k1 is only zeroed below and is not an operand of either asm statement - looks unused, confirm and drop */ + + DECLARE_ALIGNED(16,double,outputs[2]); + + re[0] = xcur; /* first pass: points 0 and 1 */ + re[1] = (xcur + xstep); + + im[0] = im[1] = ycur; + + k1[0] = k1[1] = 0; + // x' = x^2 - y^2 + a + // y' = 2xy + b + // + asm("mov %6,%%ecx\n\t" // ecx is ITERA (iterations left) + "xor %%ebx, %%ebx\n\t" // period = 0 (kept in bl) + "movapd %3,%%xmm5\n\t" // 4. 4. ; xmm5 (bailout threshold, loaded from fours) + "movapd %1,%%xmm6\n\t" // a0 a1 ; xmm6 (loaded from re) + "movaps %2,%%xmm7\n\t" // b0 b1 ; xmm7 (loaded from im) + "xorpd %%xmm0,%%xmm0\n\t" // 0. 0. ; rez in xmm0 + "xorpd %%xmm1,%%xmm1\n\t" // 0. 0. 
; imz in xmm1 + "xorpd %%xmm3,%%xmm3\n\t" // 0. 0. ; bailout counters + "xorpd %%xmm8,%%xmm8\n\t" // 0. 0. ; xold[2] (saved rez, used by the periodicity check) + "xorpd %%xmm9,%%xmm9\n\t" // 0. 0. ; yold[2] (saved imz, used by the periodicity check) + + "1:\n\t" // Main Mandelbrot computation + "movapd %%xmm0,%%xmm2\n\t" // x0 x1 ; xmm2 + "mulpd %%xmm1,%%xmm2\n\t" // x0*y0 x1*y1 ; xmm2 + "mulpd %%xmm0,%%xmm0\n\t" // x0^2 x1^2 ; xmm0 + "mulpd %%xmm1,%%xmm1\n\t" // y0^2 y1^2 ; xmm1 + "movapd %%xmm0,%%xmm4\n\t" // x0^2 x1^2 ; xmm4 + "addpd %%xmm1,%%xmm4\n\t" // x0^2+y0^2 x1... ; xmm4 + "subpd %%xmm1,%%xmm0\n\t" // x0^2-y0^2 x1... ; xmm0 + "addpd %%xmm6,%%xmm0\n\t" // x0' x1' ; xmm0 + "movapd %%xmm2,%%xmm1\n\t" // x0*y0 x1*y1 ; xmm1 + "addpd %%xmm1,%%xmm1\n\t" // 2x0*y0 2x1*y1 ; xmm1 + "addpd %%xmm7,%%xmm1\n\t" // y0' y1' ; xmm1 + + "cmpltpd %%xmm5,%%xmm4\n\t" // <4 <4 ; xmm4 (the comparison mask replaces the magnitudes) + "movapd %%xmm4,%%xmm2\n\t" // xmm2 has all 1s in the non-overflowed pixels + "movmskpd %%xmm4,%%eax\n\t" // (lower 2 bits reflect comparisons) + "andpd %4,%%xmm4\n\t" // so, prepare to increase the non-overflowed (and with ones) + "addpd %%xmm4,%%xmm3\n\t" // by updating their counters + + "or %%eax,%%eax\n\t" // have both pixels overflowed ? + + "je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop + "dec %%ecx\n\t" // otherwise, repeat the loop ITERA times... + "jnz 22f\n\t" // but before redoing the loop, first do periodicity checking + + // We've done the loop ITERA times. + // Set non-overflowed outputs to 0 (inside xmm3). Here's how: + "movapd %%xmm2,%%xmm4\n\t" // xmm4 has all 1s in the non-overflowed pixels... 
+ "xorpd %5,%%xmm4\n\t" // xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits) + "andpd %%xmm4,%%xmm3\n\t" // zero out the xmm3 parts that belong to non-overflowed (set to black) + "jmp 2f\n\t" // And jump to end of everything, where xmm3 is written into outputs + + "22:\n\t" // Periodicity checking + "inc %%bl\n\t" // period++ + "and $0xF, %%bl\n\t" // period &= 0xF (wraps to zero on every 16th pass through here) + "jnz 11f\n\t" // if period is not zero, continue to check if we're seeing xold, yold again + "movapd %%xmm0, %%xmm8\n\t" // period wrapped: time to update xold[2], yold[2] - store xold[2] in xmm8 + "movapd %%xmm1, %%xmm9\n\t" // and yold[2] in xmm9 + "jmp 1b\n\t" // and jump back to the loop beginning + + "11:\n\t" // do rez[2], imz[2] equal the saved xold[2], yold[2]? + "movapd %%xmm8, %%xmm10\n\t" // the comparison instruction will modify the target XMM register, so use xmm10 + "cmpeqpd %%xmm0, %%xmm10\n\t" // compare xmm10 (which now has xold[2]) with rez[2]. Set all 1s into xmm10 if equal + "movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison. + "or %%eax, %%eax\n\t" // are they BOTH zero? + "jz 1b\n\t" // Yes - so, neither of the two rez matched with the two xold. Repeat the loop + "movapd %%xmm9, %%xmm10\n\t" // Set xmm10 to contain yold[2] + "cmpeqpd %%xmm1, %%xmm10\n\t" // compare xmm10 with imz[2]. Set all 1s into xmm10 if equal + "movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison. + "or %%eax, %%eax\n\t" // are they BOTH zero? + "jz 1b\n\t" // Yes - so, neither of the two imz matched with the two yold. Repeat the loop. NOTE(review): both tests are per-vector (any lane), not per-pixel, and the xorpd below clears both counters - so a match in one lane also blacks out the other pixel, even one that already overflowed. Confirm this is intended. + "xorpd %%xmm3,%%xmm3\n\t" // Repetition detected. 
Set both results to 0.0 (both pixels black) + + "2:\n\t" + "movapd %%xmm3,%0\n\t" /* write both lane counters to outputs[0] and outputs[1] */ + :"=m"(outputs[0]) + :"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA) + :"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","memory"); + + int tmp = (int)(outputs[0]); + *(*p)++ = tmp; /* store point 0 and advance the caller's pointer */ + tmp = (int)(outputs[1]); + *(*p)++ = tmp; /* store point 1 */ + + re[0] = xcur + 2*xstep; /* second pass: points 2 and 3, same asm sequence as the first pass */ + re[1] = xcur + 3*xstep; + + im[0] = im[1] = ycur; + + k1[0] = k1[1] = 0; + // x' = x^2 - y^2 + a + // y' = 2xy + b + // + asm("mov %6,%%ecx\n\t" // ecx is ITERA (iterations left) + "xor %%ebx, %%ebx\n\t" // period = 0 (kept in bl) + "movapd %3,%%xmm5\n\t" // 4. 4. ; xmm5 (bailout threshold, loaded from fours) + "movapd %1,%%xmm6\n\t" // a0 a1 ; xmm6 (loaded from re) + "movaps %2,%%xmm7\n\t" // b0 b1 ; xmm7 (loaded from im) + "xorpd %%xmm0,%%xmm0\n\t" // 0. 0. ; rez in xmm0 + "xorpd %%xmm1,%%xmm1\n\t" // 0. 0. ; imz in xmm1 + "xorpd %%xmm3,%%xmm3\n\t" // 0. 0. ; bailout counters + "xorpd %%xmm8,%%xmm8\n\t" // 0. 0. ; xold[2] (saved rez, used by the periodicity check) + "xorpd %%xmm9,%%xmm9\n\t" // 0. 0. ; yold[2] (saved imz, used by the periodicity check) + + "1:\n\t" // Main Mandelbrot computation + "movapd %%xmm0,%%xmm2\n\t" // x0 x1 ; xmm2 + "mulpd %%xmm1,%%xmm2\n\t" // x0*y0 x1*y1 ; xmm2 + "mulpd %%xmm0,%%xmm0\n\t" // x0^2 x1^2 ; xmm0 + "mulpd %%xmm1,%%xmm1\n\t" // y0^2 y1^2 ; xmm1 + "movapd %%xmm0,%%xmm4\n\t" // x0^2 x1^2 ; xmm4 + "addpd %%xmm1,%%xmm4\n\t" // x0^2+y0^2 x1... ; xmm4 + "subpd %%xmm1,%%xmm0\n\t" // x0^2-y0^2 x1... 
; xmm0 + "addpd %%xmm6,%%xmm0\n\t" // x0' x1' ; xmm0 + "movapd %%xmm2,%%xmm1\n\t" // x0*y0 x1*y1 ; xmm1 + "addpd %%xmm1,%%xmm1\n\t" // 2x0*y0 2x1*y1 ; xmm1 + "addpd %%xmm7,%%xmm1\n\t" // y0' y1' ; xmm1 + + "cmpltpd %%xmm5,%%xmm4\n\t" // <4 <4 ; xmm4 (the comparison mask replaces the magnitudes) + "movapd %%xmm4,%%xmm2\n\t" // xmm2 has all 1s in the non-overflowed pixels + "movmskpd %%xmm4,%%eax\n\t" // (lower 2 bits reflect comparisons) + "andpd %4,%%xmm4\n\t" // so, prepare to increase the non-overflowed (and with ones) + "addpd %%xmm4,%%xmm3\n\t" // by updating their counters + + "or %%eax,%%eax\n\t" // have both pixels overflowed ? 
+ + "je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop + "dec %%ecx\n\t" // otherwise, repeat the loop ITERA times... + "jnz 22f\n\t" // but before redoing the loop, first do periodicity checking + + // We've done the loop ITERA times. + // Set non-overflowed outputs to 0 (inside xmm3). Here's how: + "movapd %%xmm2,%%xmm4\n\t" // xmm4 has all 1s in the non-overflowed pixels... + "xorpd %5,%%xmm4\n\t" // xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits) + "andpd %%xmm4,%%xmm3\n\t" // zero out the xmm3 parts that belong to non-overflowed (set to black) + "jmp 2f\n\t" // And jump to end of everything, where xmm3 is written into outputs + + "22:\n\t" // Periodicity checking + "inc %%bl\n\t" // period++ + "and $0xF, %%bl\n\t" // period &= 0xF (wraps to zero on every 16th pass through here) + "jnz 11f\n\t" // if period is not zero, continue to check if we're seeing xold, yold again + "movapd %%xmm0, %%xmm8\n\t" // period wrapped: time to update xold[2], yold[2] - store xold[2] in xmm8 + "movapd %%xmm1, %%xmm9\n\t" // and yold[2] in xmm9 + "jmp 1b\n\t" // and jump back to the loop beginning + + "11:\n\t" // do rez[2], imz[2] equal the saved xold[2], yold[2]? + "movapd %%xmm8, %%xmm10\n\t" // the comparison instruction will modify the target XMM register, so use xmm10 + "cmpeqpd %%xmm0, %%xmm10\n\t" // compare xmm10 (which now has xold[2]) with rez[2]. Set all 1s into xmm10 if equal + "movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison. + "or %%eax, %%eax\n\t" // are they BOTH zero? + "jz 1b\n\t" // Yes - so, neither of the two rez matched with the two xold. Repeat the loop + "movapd %%xmm9, %%xmm10\n\t" // Set xmm10 to contain yold[2] + "cmpeqpd %%xmm1, %%xmm10\n\t" // compare xmm10 with imz[2]. Set all 1s into xmm10 if equal + "movmskpd %%xmm10,%%eax\n\t" // the lower 2 bits of EAX now reflect the result of the comparison. + "or %%eax, %%eax\n\t" // are they BOTH zero? 
+ "jz 1b\n\t" // Yes - so, neither of the two imz matched with the two yold. Repeat the loop. NOTE(review): both tests are per-vector (any lane), not per-pixel, and the xorpd below clears both counters - so a match in one lane also blacks out the other pixel, even one that already overflowed. Confirm this is intended. + "xorpd %%xmm3,%%xmm3\n\t" // Repetition detected. Set both results to 0.0 (both pixels black) + + "2:\n\t" + "movapd %%xmm3,%0\n\t" + :"=m"(outputs[0]) + :"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA) + :"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","memory"); + + tmp = (int)(outputs[0]); + *(*p)++ = tmp; + tmp = (int)(outputs[1]); + *(*p)++ = tmp; +} + void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p) { DECLARE_ALIGNED(32,double,re[4]); diff --git a/src/sse.h b/src/sse.h index f2bef22..1e81421 100644 --- a/src/sse.h +++ b/src/sse.h @@ -2,6 +2,7 @@ #define __MANDELSSE_H__ void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char **p); +void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p); void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p); #endif diff --git a/src/xaos.cc b/src/xaos.cc index 84de03f..8a7d943 100644 --- a/src/xaos.cc +++ b/src/xaos.cc @@ -28,7 +28,7 @@ int compare_points(const void *p1, const void *p2) } #if defined(__x86_64__) && !defined(__WIN64__) -#define AUTO_DISPATCH __attribute__((target_clones("default","avx"))) +#define AUTO_DISPATCH __attribute__((target_clones("default","sse","avx"))) #else #define AUTO_DISPATCH #endif