From dd994f5f5fb171bf44ca75b224f88aa73f8554b4 Mon Sep 17 00:00:00 2001 From: Thanassis Tsiodras Date: Thu, 21 Jul 2022 20:12:35 +0200 Subject: [PATCH] Reverted change of 'dec ecx' to 'sub ecx,1' After benchmarking on Ivy Bridge and Skylake (the only two platforms I have access to), it appears that /u/FUZxxl was wrong: 'dec ecx' performs faster. --- src/sse.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sse.cc b/src/sse.cc index ca0d017..586e728 100644 --- a/src/sse.cc +++ b/src/sse.cc @@ -141,7 +141,7 @@ void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p "test %%eax,%%eax\n\t" // have both pixels overflowed ? "je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop - "subl $1, %%ecx\n\t" // otherwise, repeat the loop iterations times... + "dec %%ecx\n\t" // otherwise, repeat the loop iterations times... "jnz 22f\n\t" // but before redoing the loop, first do periodicity checking // We've done the loop 'iterations' times. @@ -225,7 +225,7 @@ void CoreLoopDoubleSSE(double xcur, double ycur, double xstep, unsigned char **p "test %%eax,%%eax\n\t" // have both pixels overflowed ? "je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop - "subl $1, %%ecx\n\t" // otherwise, repeat the loop 'iterations' times... + "dec %%ecx\n\t" // otherwise, repeat the loop 'iterations' times... "jnz 22f\n\t" // but before redoing the loop, first do periodicity checking // We've done the loop 'iterations' times. @@ -317,7 +317,7 @@ void CoreLoopDoubleAVX(double xcur, double ycur, double xstep, unsigned char **p "test %%eax,%%eax\n\t" // have all 4 pixels overflowed ? "je 2f\n\t" // yes, jump forward to label 2 (hence, 2f) and end the loop // - "subl $1, %%ecx\n\t" // otherwise, repeat the loop up to iterations times... + "dec %%ecx\n\t" // otherwise, repeat the loop up to iterations times... 
"jnz 22f\n\t" // but before redoing the loop, first do periodicity checking // We've done the loop iterations times.