From 497a59aa31a9dd5c5e6167dba3e9ff69d6afc8d0 Mon Sep 17 00:00:00 2001
From: Thanassis Tsiodras <ttsiodras@gmail.com>
Date: Thu, 14 Jul 2022 21:59:16 +0200
Subject: [PATCH] Reintroduce SSE fallback, for my poor Atom x5-Z8350. All hail
 the Atomic PI!

---
 README        |  17 +++--
 README.md     |  17 +++--
 src/mandel.cc |  34 ++++++++--
 src/sse.cc    | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/sse.h     |   1 +
 src/xaos.cc   |   2 +-
 6 files changed, 229 insertions(+), 19 deletions(-)

diff --git a/README b/README
index 76d3389..f89b89b 100644
--- a/README
+++ b/README
@@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN
 Windows
 -------
 Windows users can download and run a pre-compiled Windows binary
-[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip).
+[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip).
 
 After decompressing, you can simply execute either one of the two .bat
 files. The 'autopilot' one zooms in a specific location, while the other
@@ -32,12 +32,15 @@ You can then simply...
     $ src/mandelSSE -h
-    Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT]
+    Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]
     Where:
-            -h	Show this help message
-            -m	Run in mouse-driven mode
-            -a	Run in autopilot mode (default)
-            -b	Run in benchmark mode (implies autopilot)
-            -f fps	Enforce upper bound of frames per second (default: 60)
-                    (use 0 to run at full possible speed)
+        -h      Show this help message
+        -m      Run in mouse-driven mode
+        -a      Run in autopilot mode (default)
+        -b      Run in benchmark mode (implies autopilot)
+        -v      Force use of AVX
+        -s      Force use of SSE
+        -d      Force use of non-AVX, non-SSE code
+        -f fps  Enforce upper bound of frames per second (default: 60)
+                (use 0 to run at full possible speed)
 
     If WIDTH and HEIGHT are not provided, they default to: 1024 768
 
diff --git a/README.md b/README.md
index 76d3389..f89b89b 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ COMPILE/INSTALL/RUN
 Windows
 -------
 Windows users can download and run a pre-compiled Windows binary
-[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.9/mandelSSE-win32-2.9.zip).
+[here](https://github.com/ttsiodras/MandelbrotSSE/releases/download/2.10/mandelSSE-win32-2.10.zip).
 
 After decompressing, you can simply execute either one of the two .bat
 files. The 'autopilot' one zooms in a specific location, while the other
@@ -32,12 +32,15 @@ You can then simply...
     $ src/mandelSSE -h
-    Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT]
+    Usage: ./src/mandelSSE [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]
     Where:
-            -h	Show this help message
-            -m	Run in mouse-driven mode
-            -a	Run in autopilot mode (default)
-            -b	Run in benchmark mode (implies autopilot)
-            -f fps	Enforce upper bound of frames per second (default: 60)
-                    (use 0 to run at full possible speed)
+        -h      Show this help message
+        -m      Run in mouse-driven mode
+        -a      Run in autopilot mode (default)
+        -b      Run in benchmark mode (implies autopilot)
+        -v      Force use of AVX
+        -s      Force use of SSE
+        -d      Force use of non-AVX, non-SSE code
+        -f fps  Enforce upper bound of frames per second (default: 60)
+                (use 0 to run at full possible speed)
 
     If WIDTH and HEIGHT are not provided, they default to: 1024 768
 
diff --git a/src/mandel.cc b/src/mandel.cc
index 90e60a1..089e703 100644
--- a/src/mandel.cc
+++ b/src/mandel.cc
@@ -26,12 +26,15 @@
 
 void usage(char *argv[])
 {
-    printf("Usage: %s [-a|-m] [-h] [-b] [-f rate] [WIDTH HEIGHT]\n", argv[0]);
+    printf("Usage: %s [-a|-m] [-h] [-b] [-v|-s|-d] [-f rate] [WIDTH HEIGHT]\n", argv[0]);
     puts("Where:");
     puts("\t-h\tShow this help message");
     puts("\t-m\tRun in mouse-driven mode");
     puts("\t-a\tRun in autopilot mode (default)");
     puts("\t-b\tRun in benchmark mode (implies autopilot)");
+    puts("\t-v\tForce use of AVX");
+    puts("\t-s\tForce use of SSE");
+    puts("\t-d\tForce use of non-AVX, non-SSE code");
     puts("\t-f fps\tEnforce upper bound of frames per second (default: 60)");
     puts("\t      \t(use 0 to run at full possible speed)\n");
     puts("If WIDTH and HEIGHT are not provided, they default to: 1024 768");
@@ -42,8 +45,9 @@ int main(int argc, char *argv[])
 {
     int opt, fps = 60;
     bool autoPilot = true, benchmark = false;
+    bool forceAVX = false, forceSSE = false, forceDefault = false;
 
-    while ((opt = getopt(argc, argv, "hmabf:")) != -1) {
+    while ((opt = getopt(argc, argv, "hmabvsdf:")) != -1) {
         switch (opt) {
             case 'h':
                 usage(argv);
@@ -58,6 +62,15 @@ int main(int argc, char *argv[])
                 autoPilot = true;
                 benchmark = true;
                 break;
+            case 'v':
+                forceAVX = true;
+                break;
+            case 's':
+                forceSSE = true;
+                break;
+            case 'd':
+                forceDefault = true;
+                break;
             case 'f':
                 if (1 != sscanf(optarg, "%d", &fps))
                     panic("[x] Not a valid frame rate: '%s'", optarg);
@@ -112,8 +125,21 @@ int main(int argc, char *argv[])
     else
         printf("[-] FPS Limit:  %d frames/sec\n", fps);
 #ifdef __x86_64__
-    CoreLoopDouble = __builtin_cpu_supports("avx") ? CoreLoopDoubleAVX : CoreLoopDoubleDefault;
-    printf("[-] Mode: %s\n", __builtin_cpu_supports("avx") ? "AVX" : "non-AVX");
+    if (forceAVX)                           // an explicit -v/-s/-d wins, checked in that order
+        CoreLoopDouble = CoreLoopDoubleAVX;
+    else if (forceSSE)
+        CoreLoopDouble = CoreLoopDoubleSSE;
+    else if (forceDefault)
+        CoreLoopDouble = CoreLoopDoubleDefault;
+    else                                    // no override: pick the widest kernel this CPU can run
+        CoreLoopDouble =
+            __builtin_cpu_supports("avx") ? CoreLoopDoubleAVX
+            : __builtin_cpu_supports("sse2") ? CoreLoopDoubleSSE  // the SSE kernel is packed-double, i.e. SSE2
+            : CoreLoopDoubleDefault;
+    printf("[-] Mode: %s\n",
+        CoreLoopDouble == CoreLoopDoubleAVX ? "AVX"
+        : CoreLoopDouble == CoreLoopDoubleSSE ? "SSE"
+        : "non-AVX/non-SSE");
 #else
     CoreLoopDouble = CoreLoopDoubleDefault;
     printf("[-] Mode: %s\n", "non-AVX");
diff --git a/src/sse.cc b/src/sse.cc
index 6101f84..40419a1 100644
--- a/src/sse.cc
+++ b/src/sse.cc
@@ -91,6 +91,183 @@ void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char
 
 #ifdef __x86_64__
 
+void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p)
+{
+    // Computes the 4 pixels at x = xcur, xcur+xstep, xcur+2*xstep, xcur+3*xstep
+    // (all at y = ycur), two per pass in the lanes of SSE2 packed-double registers,
+    // and appends one byte per pixel through *p; non-escaping pixels are written as 0.
+    DECLARE_ALIGNED(16,double,re[2]);
+    DECLARE_ALIGNED(16,double,im[2]);
+    DECLARE_ALIGNED(16,double,outputs[2]);
+
+    re[0] = xcur;
+    re[1] = (xcur + xstep);
+
+    im[0] = im[1] = ycur;
+
+                                              // x' = x^2 - y^2 + a
+                                              // y' = 2xy + b
+                                              //
+    asm("mov    %6,%%ecx\n\t"                 //  ecx = ITERA (iterations left)
+        "xor    %%ebx, %%ebx\n\t"             //  period = 0
+        "movapd %3,%%xmm5\n\t"                //  4.     4.        ; xmm5
+        "movapd %1,%%xmm6\n\t"                //  a0     a1        ; xmm6
+        "movaps %2,%%xmm7\n\t"                //  b0     b1        ; xmm7
+        "xorpd  %%xmm0,%%xmm0\n\t"            //  0.     0.        ; rez in xmm0
+        "xorpd  %%xmm1,%%xmm1\n\t"            //  0.     0.        ; imz in xmm1
+        "xorpd  %%xmm3,%%xmm3\n\t"            //  0.     0.        ; iteration counters
+        "xorpd  %%xmm8,%%xmm8\n\t"            //  0.     0.        ; xold (periodicity check)
+        "xorpd  %%xmm9,%%xmm9\n\t"            //  0.     0.        ; yold (periodicity check)
+
+        "1:\n\t"                              //  Main Mandelbrot computation
+        "movapd %%xmm0,%%xmm2\n\t"            //  x0     x1        ; xmm2
+        "mulpd  %%xmm1,%%xmm2\n\t"            //  x0*y0  x1*y1     ; xmm2
+        "mulpd  %%xmm0,%%xmm0\n\t"            //  x0^2   x1^2      ; xmm0
+        "mulpd  %%xmm1,%%xmm1\n\t"            //  y0^2   y1^2      ; xmm1
+        "movapd %%xmm0,%%xmm4\n\t"            //  x0^2   x1^2      ; xmm4
+        "addpd  %%xmm1,%%xmm4\n\t"            //  x0^2+y0^2  x1... ; xmm4
+        "subpd  %%xmm1,%%xmm0\n\t"            //  x0^2-y0^2  x1... ; xmm0
+        "addpd  %%xmm6,%%xmm0\n\t"            //  x0'    x1'       ; xmm0
+        "movapd %%xmm2,%%xmm1\n\t"            //  x0*y0  x1*y1     ; xmm1
+        "addpd  %%xmm1,%%xmm1\n\t"            //  2x0*y0 2x1*y1    ; xmm1
+        "addpd  %%xmm7,%%xmm1\n\t"            //  y0'    y1'       ; xmm1
+
+        "cmpltpd %%xmm5,%%xmm4\n\t"           //  <4     <4        ; xmm4
+        "movapd %%xmm4,%%xmm2\n\t"            //  xmm2 has all 1s in the non-overflowed pixels
+        "movmskpd %%xmm4,%%eax\n\t"           //  (lower 2 bits reflect comparisons)
+        "andpd  %4,%%xmm4\n\t"                //  so, prepare to increase the non-overflowed (and with ones)
+        "addpd  %%xmm4,%%xmm3\n\t"            //  by updating their counters
+
+        "or     %%eax,%%eax\n\t"              //  have both pixels overflowed ?
+
+        "je     2f\n\t"                       //  yes, jump forward to label 2 (hence, 2f) and end the loop
+        "dec    %%ecx\n\t"                    //  otherwise, repeat the loop ITERA times...
+        "jnz    22f\n\t"                      //  but before redoing the loop, first do periodicity checking
+
+                                              //  We've done the loop ITERA times.
+                                              //  Set non-overflowed outputs to 0 (inside xmm3). Here's how:
+        "movapd %%xmm2,%%xmm4\n\t"            //  xmm4 has all 1s in the non-overflowed pixels...
+        "xorpd  %5,%%xmm4\n\t"                //  xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits)
+        "andpd  %%xmm4,%%xmm3\n\t"            //  zero out the xmm3 parts that belong to non-overflowed (set to black)
+        "jmp    2f\n\t"                       //  And jump to end of everything, where xmm3 is written into outputs
+
+        "22:\n\t"                             //  Periodicity checking
+        "inc %%bl\n\t"                        //  period++
+        "and $0xF, %%bl\n\t"                  //  period &= 0xF
+        "jnz 11f\n\t"                         //  if period is not zero, go check whether we're seeing xold, yold again
+        "movapd %%xmm0, %%xmm8\n\t"           //  every 16th pass: refresh xold[2] (in xmm8)
+        "movapd %%xmm1, %%xmm9\n\t"           //  and yold[2] (in xmm9)
+        "jmp    1b\n\t"                       //  and jump back to the loop beginning
+
+        "11:\n\t"                             //  which still-running lanes are back at (xold, yold)?
+        "movapd %%xmm8,%%xmm10\n\t"           //  cmpeqpd overwrites its destination, so compare in a copy of xold
+        "cmpeqpd %%xmm0,%%xmm10\n\t"          //  xmm10 = all 1s in lanes where rez == xold
+        "movapd %%xmm9,%%xmm4\n\t"            //  xmm4 is free here (it is recomputed on every pass through label 1)
+        "cmpeqpd %%xmm1,%%xmm4\n\t"           //  xmm4 = all 1s in lanes where imz == yold
+        "andpd  %%xmm4,%%xmm10\n\t"           //  xmm10 = lanes where BOTH match, i.e. that lane's orbit repeats
+        "andnpd %%xmm2,%%xmm10\n\t"           //  xmm10 = ~xmm10 & xmm2: non-overflowed lanes that do NOT repeat
+        "movmskpd %%xmm10,%%eax\n\t"          //  the lower 2 bits of EAX now flag those lanes
+        "or     %%eax,%%eax\n\t"              //  is any such lane left?
+        "jnz    1b\n\t"                       //  yes - it may still overflow, so keep iterating
+        "andnpd %%xmm3,%%xmm2\n\t"            //  no - xmm2 = ~xmm2 & xmm3: zero (black) only the repeating lanes,
+        "movapd %%xmm2,%%xmm3\n\t"            //  keeping the counter of a lane that had already overflowed
+
+        "2:\n\t"
+        "movapd %%xmm3,%0\n\t"
+        :"=m"(outputs[0])
+        :"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA)
+        :"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","cc","memory");
+
+    int tmp = (int)(outputs[0]);
+    *(*p)++ = tmp;
+    tmp = (int)(outputs[1]);
+    *(*p)++ = tmp;
+
+    re[0] = xcur + 2*xstep;
+    re[1] = xcur + 3*xstep;
+
+    im[0] = im[1] = ycur;
+
+    // Second pair of pixels: the same kernel, run on the new re[]/im[].
+                                              // x' = x^2 - y^2 + a
+                                              // y' = 2xy + b
+                                              //
+    asm("mov    %6,%%ecx\n\t"                 //  ecx = ITERA (iterations left)
+        "xor    %%ebx, %%ebx\n\t"             //  period = 0
+        "movapd %3,%%xmm5\n\t"                //  4.     4.        ; xmm5
+        "movapd %1,%%xmm6\n\t"                //  a0     a1        ; xmm6
+        "movaps %2,%%xmm7\n\t"                //  b0     b1        ; xmm7
+        "xorpd  %%xmm0,%%xmm0\n\t"            //  0.     0.        ; rez in xmm0
+        "xorpd  %%xmm1,%%xmm1\n\t"            //  0.     0.        ; imz in xmm1
+        "xorpd  %%xmm3,%%xmm3\n\t"            //  0.     0.        ; iteration counters
+        "xorpd  %%xmm8,%%xmm8\n\t"            //  0.     0.        ; xold (periodicity check)
+        "xorpd  %%xmm9,%%xmm9\n\t"            //  0.     0.        ; yold (periodicity check)
+
+        "1:\n\t"                              //  Main Mandelbrot computation
+        "movapd %%xmm0,%%xmm2\n\t"            //  x0     x1        ; xmm2
+        "mulpd  %%xmm1,%%xmm2\n\t"            //  x0*y0  x1*y1     ; xmm2
+        "mulpd  %%xmm0,%%xmm0\n\t"            //  x0^2   x1^2      ; xmm0
+        "mulpd  %%xmm1,%%xmm1\n\t"            //  y0^2   y1^2      ; xmm1
+        "movapd %%xmm0,%%xmm4\n\t"            //  x0^2   x1^2      ; xmm4
+        "addpd  %%xmm1,%%xmm4\n\t"            //  x0^2+y0^2  x1... ; xmm4
+        "subpd  %%xmm1,%%xmm0\n\t"            //  x0^2-y0^2  x1... ; xmm0
+        "addpd  %%xmm6,%%xmm0\n\t"            //  x0'    x1'       ; xmm0
+        "movapd %%xmm2,%%xmm1\n\t"            //  x0*y0  x1*y1     ; xmm1
+        "addpd  %%xmm1,%%xmm1\n\t"            //  2x0*y0 2x1*y1    ; xmm1
+        "addpd  %%xmm7,%%xmm1\n\t"            //  y0'    y1'       ; xmm1
+
+        "cmpltpd %%xmm5,%%xmm4\n\t"           //  <4     <4        ; xmm4
+        "movapd %%xmm4,%%xmm2\n\t"            //  xmm2 has all 1s in the non-overflowed pixels
+        "movmskpd %%xmm4,%%eax\n\t"           //  (lower 2 bits reflect comparisons)
+        "andpd  %4,%%xmm4\n\t"                //  so, prepare to increase the non-overflowed (and with ones)
+        "addpd  %%xmm4,%%xmm3\n\t"            //  by updating their counters
+
+        "or     %%eax,%%eax\n\t"              //  have both pixels overflowed ?
+
+        "je     2f\n\t"                       //  yes, jump forward to label 2 (hence, 2f) and end the loop
+        "dec    %%ecx\n\t"                    //  otherwise, repeat the loop ITERA times...
+        "jnz    22f\n\t"                      //  but before redoing the loop, first do periodicity checking
+
+                                              //  We've done the loop ITERA times.
+                                              //  Set non-overflowed outputs to 0 (inside xmm3). Here's how:
+        "movapd %%xmm2,%%xmm4\n\t"            //  xmm4 has all 1s in the non-overflowed pixels...
+        "xorpd  %5,%%xmm4\n\t"                //  xmm4 has all 1s in the overflowed pixels (toggled, via xoring with allbits)
+        "andpd  %%xmm4,%%xmm3\n\t"            //  zero out the xmm3 parts that belong to non-overflowed (set to black)
+        "jmp    2f\n\t"                       //  And jump to end of everything, where xmm3 is written into outputs
+
+        "22:\n\t"                             //  Periodicity checking
+        "inc %%bl\n\t"                        //  period++
+        "and $0xF, %%bl\n\t"                  //  period &= 0xF
+        "jnz 11f\n\t"                         //  if period is not zero, go check whether we're seeing xold, yold again
+        "movapd %%xmm0, %%xmm8\n\t"           //  every 16th pass: refresh xold[2] (in xmm8)
+        "movapd %%xmm1, %%xmm9\n\t"           //  and yold[2] (in xmm9)
+        "jmp    1b\n\t"                       //  and jump back to the loop beginning
+
+        "11:\n\t"                             //  which still-running lanes are back at (xold, yold)?
+        "movapd %%xmm8,%%xmm10\n\t"           //  cmpeqpd overwrites its destination, so compare in a copy of xold
+        "cmpeqpd %%xmm0,%%xmm10\n\t"          //  xmm10 = all 1s in lanes where rez == xold
+        "movapd %%xmm9,%%xmm4\n\t"            //  xmm4 is free here (it is recomputed on every pass through label 1)
+        "cmpeqpd %%xmm1,%%xmm4\n\t"           //  xmm4 = all 1s in lanes where imz == yold
+        "andpd  %%xmm4,%%xmm10\n\t"           //  xmm10 = lanes where BOTH match, i.e. that lane's orbit repeats
+        "andnpd %%xmm2,%%xmm10\n\t"           //  xmm10 = ~xmm10 & xmm2: non-overflowed lanes that do NOT repeat
+        "movmskpd %%xmm10,%%eax\n\t"          //  the lower 2 bits of EAX now flag those lanes
+        "or     %%eax,%%eax\n\t"              //  is any such lane left?
+        "jnz    1b\n\t"                       //  yes - it may still overflow, so keep iterating
+        "andnpd %%xmm3,%%xmm2\n\t"            //  no - xmm2 = ~xmm2 & xmm3: zero (black) only the repeating lanes,
+        "movapd %%xmm2,%%xmm3\n\t"            //  keeping the counter of a lane that had already overflowed
+
+        "2:\n\t"
+        "movapd %%xmm3,%0\n\t"
+        :"=m"(outputs[0])
+        :"m"(re[0]),"m"(im[0]),"m"(fours[0]),"m"(ones[0]),"m"(allbits[0]),"i"(ITERA)
+        :"%eax","%ebx","%ecx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","cc","memory");
+
+    tmp = (int)(outputs[0]);
+    *(*p)++ = tmp;
+    tmp = (int)(outputs[1]);
+    *(*p)++ = tmp;
+}
+
 void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p)
 {
     DECLARE_ALIGNED(32,double,re[4]);
diff --git a/src/sse.h b/src/sse.h
index f2bef22..1e81421 100644
--- a/src/sse.h
+++ b/src/sse.h
@@ -2,6 +2,7 @@
 #define __MANDELSSE_H__
 
 void CoreLoopDoubleDefault(double xcur, double ycur, double xstep, unsigned char **p);
+void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p);
 void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p);
 
 #endif
diff --git a/src/xaos.cc b/src/xaos.cc
index 84de03f..8a7d943 100644
--- a/src/xaos.cc
+++ b/src/xaos.cc
@@ -28,7 +28,7 @@ int compare_points(const void *p1, const void *p2)
 }
 
 #if defined(__x86_64__) && !defined(__WIN64__)
-#define AUTO_DISPATCH __attribute__((target_clones("default","avx")))
+#define AUTO_DISPATCH __attribute__((target_clones("default","sse","avx")))  // NOTE(review): this branch is x86_64-only, where SSE/SSE2 are baseline, so the "sse" clone presumably duplicates "default" - confirm
 #else
 #define AUTO_DISPATCH
 #endif