/* Blake2s.c -- BLAKE2sp Hash
2024-05-18 : Igor Pavlov : Public domain
2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
#include "Precomp.h"
// #include <stdio.h>
#include <string.h>
#include "Blake2.h"
#include "RotateDefs.h"
#include "Compiler.h"
#include "CpuArch.h"
/*
  if defined(__AVX512F__) && defined(__AVX512VL__)
  {
    we define Z7_BLAKE2S_USE_AVX512_ALWAYS,
    and the compiler is free to use avx512 in any code.
  }
  else if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
  {
    we use avx512 only in the sse* and avx* branches of the code.
  }
*/
// #define Z7_BLAKE2S_USE_AVX512_ALWAYS // for debug
#if defined(__SSE2__)
#define Z7_BLAKE2S_USE_VECTORS
#elif defined(MY_CPU_X86_OR_AMD64)
#if defined(_MSC_VER) && _MSC_VER > 1200 \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
|| defined(__clang__) \
|| defined(__INTEL_COMPILER)
#define Z7_BLAKE2S_USE_VECTORS
#endif
#endif
#ifdef Z7_BLAKE2S_USE_VECTORS
#define Z7_BLAKE2SP_USE_FUNCTIONS
// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned to a 32-byte boundary.
// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
// SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves the performance by 5-15%.
#if defined(__SSSE3__)
#define Z7_BLAKE2S_USE_SSSE3
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
#define Z7_BLAKE2S_USE_SSSE3
#endif
#ifdef Z7_BLAKE2S_USE_SSSE3
/* SSE41 : for _mm_insert_epi32 (pinsrd)
   it can slightly reduce code size and improve the performance in some cases.
   it's used only for the last 512-1024 bytes, if the FAST versions (2 or 3) of the vector algos are used.
   it can be used for all blocks in the other algos (4+).
*/
#if defined(__SSE4_1__)
#define Z7_BLAKE2S_USE_SSE41
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
#define Z7_BLAKE2S_USE_SSE41
#endif
#endif // SSSE3
#if defined(__GNUC__) || defined(__clang__)
#if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("avx512vl,avx512f")))
#else
#if defined(Z7_BLAKE2S_USE_SSE41)
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1")))
#elif defined(Z7_BLAKE2S_USE_SSSE3)
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3")))
#else
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2")))
#endif
#endif
#endif
#if defined(__AVX2__)
#define Z7_BLAKE2S_USE_AVX2
#else
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
#define Z7_BLAKE2S_USE_AVX2
#ifdef Z7_BLAKE2S_USE_AVX2
#if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) && !(defined(__AVX512F__) && defined(__AVX512VL__))
#define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx512vl,avx512f")))
#else
#define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2")))
#endif
#endif
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#if (Z7_MSC_VER_ORIGINAL == 1900)
#pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
#define Z7_BLAKE2S_USE_AVX2
#endif
#endif
#ifdef Z7_BLAKE2S_USE_SSE41
#include <smmintrin.h> // SSE4.1
#elif defined(Z7_BLAKE2S_USE_SSSE3)
#include <tmmintrin.h> // SSSE3
#else
#include <emmintrin.h> // SSE2
#endif
#ifdef Z7_BLAKE2S_USE_AVX2
#include <immintrin.h>
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
#endif // avx2
#if defined(__AVX512F__) && defined(__AVX512VL__)
// && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
#define Z7_BLAKE2S_USE_AVX512_ALWAYS
#endif
// #pragma message ("=== Blake2s AVX512")
#endif
#define Z7_BLAKE2S_USE_V128_FAST
// speed optimization for small messages:
// #define Z7_BLAKE2S_USE_V128_WAY2
#ifdef Z7_BLAKE2S_USE_AVX2
// for debug:
// gather is slow
// #define Z7_BLAKE2S_USE_GATHER
#define Z7_BLAKE2S_USE_AVX2_FAST
// speed optimization for small messages:
// #define Z7_BLAKE2S_USE_AVX2_WAY2
// #define Z7_BLAKE2S_USE_AVX2_WAY4
#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
defined(Z7_BLAKE2S_USE_AVX2_WAY4)
#define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#endif
#endif
#define Z7_BLAKE2SP_ALGO_DEFAULT 0
#define Z7_BLAKE2SP_ALGO_SCALAR 1
#ifdef Z7_BLAKE2S_USE_V128_FAST
#define Z7_BLAKE2SP_ALGO_V128_FAST 2
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#define Z7_BLAKE2SP_ALGO_V256_FAST 3
#endif
#define Z7_BLAKE2SP_ALGO_V128_WAY1 4
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#define Z7_BLAKE2SP_ALGO_V128_WAY2 5
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
#define Z7_BLAKE2SP_ALGO_V256_WAY2 6
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
#define Z7_BLAKE2SP_ALGO_V256_WAY4 7
#endif
#endif // Z7_BLAKE2S_USE_VECTORS
#define BLAKE2S_FINAL_FLAG (~(UInt32)0)
#define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS
#define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
#define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1)
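// A minimal sanity note, assuming the usual values from Blake2.h
// (Z7_BLAKE2S_BLOCK_SIZE == 64 and Z7_BLAKE2SP_PARALLEL_DEGREE == 8 per the
// BLAKE2sp spec): SUPER_BLOCK_SIZE is then 512 bytes, one 64-byte block for
// each of the 8 parallel lanes, and SUPER_BLOCK_MASK is 0x1FF.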
#define V_INDEX_0_0 0
#define V_INDEX_1_0 1
#define V_INDEX_2_0 2
#define V_INDEX_3_0 3
#define V_INDEX_0_1 4
#define V_INDEX_1_1 5
#define V_INDEX_2_1 6
#define V_INDEX_3_1 7
#define V_INDEX_0_2 8
#define V_INDEX_1_2 9
#define V_INDEX_2_2 10
#define V_INDEX_3_2 11
#define V_INDEX_0_3 12
#define V_INDEX_1_3 13
#define V_INDEX_2_3 14
#define V_INDEX_3_3 15
#define V_INDEX_4_0 0
#define V_INDEX_5_0 1
#define V_INDEX_6_0 2
#define V_INDEX_7_0 3
#define V_INDEX_7_1 4
#define V_INDEX_4_1 5
#define V_INDEX_5_1 6
#define V_INDEX_6_1 7
#define V_INDEX_6_2 8
#define V_INDEX_7_2 9
#define V_INDEX_4_2 10
#define V_INDEX_5_2 11
#define V_INDEX_5_3 12
#define V_INDEX_6_3 13
#define V_INDEX_7_3 14
#define V_INDEX_4_3 15
#define V(row, col) v[V_INDEX_ ## row ## _ ## col]
#define k_Blake2s_IV_0 0x6A09E667UL
#define k_Blake2s_IV_1 0xBB67AE85UL
#define k_Blake2s_IV_2 0x3C6EF372UL
#define k_Blake2s_IV_3 0xA54FF53AUL
#define k_Blake2s_IV_4 0x510E527FUL
#define k_Blake2s_IV_5 0x9B05688CUL
#define k_Blake2s_IV_6 0x1F83D9ABUL
#define k_Blake2s_IV_7 0x5BE0CD19UL
#define KIV(n) (k_Blake2s_IV_## n)
#ifdef Z7_BLAKE2S_USE_VECTORS
MY_ALIGN(16)
static const UInt32 k_Blake2s_IV[8] =
{
KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
};
#endif
#define STATE_T(s) ((s) + 8)
#define STATE_F(s) ((s) + 10)
#ifdef Z7_BLAKE2S_USE_VECTORS
#define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
#define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
// here we use unaligned loads and stores.
// use this branch if CBlake2sp can be unaligned to a 16-byte boundary.
#define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define LOAD_128_FROM_STRUCT(p) LOADU_128(p)
#define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r)
#else
// here we use aligned loads and stores.
// use this branch if CBlake2sp is aligned to a 16-byte boundary.
#define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r)
#define LOAD_128_FROM_STRUCT(p) LOAD_128(p)
#define STORE_128_TO_STRUCT(p, r) STORE_128(p, r)
#endif
#endif // Z7_BLAKE2S_USE_VECTORS
#if 0
static void PrintState(const UInt32 *s, unsigned num)
{
unsigned i;
printf("\n");
for (i = 0; i < num; i++)
printf(" %08x", (unsigned)s[i]);
}
static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
{
unsigned i;
for (i = 0; i < y; i++)
PrintState(s + i * x, x);
printf("\n");
}
#endif
#define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
#define BLAKE2S_NUM_ROUNDS 10
#if defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP(mac) \
{ unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
#endif
/*
#define ROUNDS_LOOP_2(mac) \
{ unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
*/
#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP_UNROLLED(m) \
{ m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
#endif
#define SIGMA_TABLE(M) \
M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \
M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \
M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \
M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \
M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \
M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \
M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \
M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \
M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \
M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 )
#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
{ a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
// MY_ALIGN(32)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
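#if 0
/* Reference (scalar) BLAKE2s G function, kept here only as a sketch of what
   the vectorized rounds below compute for one column / diagonal (RFC 7693).
   "mx" and "my" are the two message words selected by the sigma row for this
   call; rotrFixed is assumed to come from RotateDefs.h.
   This block is not compiled; it is for documentation only. */
static void Blake2s_G_Ref(UInt32 v[16],
    unsigned a, unsigned b, unsigned c, unsigned d, UInt32 mx, UInt32 my)
{
  v[a] += v[b] + mx;  v[d] = rotrFixed(v[d] ^ v[a], 16);
  v[c] += v[d];       v[b] = rotrFixed(v[b] ^ v[c], 12);
  v[a] += v[b] + my;  v[d] = rotrFixed(v[d] ^ v[a],  8);
  v[c] += v[d];       v[b] = rotrFixed(v[b] ^ v[c],  7);
}
#endif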
#define GET_SIGMA_PTR(p, index) \
((const void *)((const Byte *)(const void *)(p) + (index)))
#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
#ifdef Z7_BLAKE2S_USE_VECTORS
#if 0
// loading constants from memory
// is faster for some compilers.
#define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n)
MY_ALIGN(64)
static const UInt32 k_Blake2s_IV_WAY4[]=
{
KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
};
#define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
#else
// use constant generation:
#define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i))
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#define GET_CONST_128_FROM_ARRAY32(k) \
_mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
#endif
#if 0
#define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
#define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
#define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
#define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
#define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
#else
#if defined(Z7_BLAKE2S_USE_SSSE3) && \
!defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
#define k_r8 LOAD_128(k_r8_arr)
#define k_r16 LOAD_128(k_r16_arr)
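// note: these pshufb masks implement the 32-bit rotations as byte moves:
// within each little-endian 32-bit lane (bytes b0..b3), mask { 1, 2, 3, 0 }
// yields { b1, b2, b3, b0 }, i.e. rotate right by 8, and mask { 2, 3, 0, 1 }
// swaps the 16-bit halves, i.e. rotate by 16.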
#endif
MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
#define k_inc LOAD_128(k_inc_arr)
#define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
#define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#ifdef Z7_BLAKE2S_USE_AVX2
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
#define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
#else
#define MY_mm256_set_m128i _mm256_set_m128i
#endif
#define SET_FROM_128(a) MY_mm256_set_m128i(a, a)
#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
{
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
};
MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
{
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
};
#define k_r8_256 LOAD_256(k_r8_arr_256)
#define k_r16_256 LOAD_256(k_r16_arr_256)
#endif
// #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
// #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
// #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
// #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
#define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
#endif // Z7_BLAKE2S_USE_AVX2
#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
/*
IPC(TP) ports:
1 p__5 : skl- : SSE : shufps : _mm_shuffle_ps
2 p_15 : icl+
1 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps
3 p015 : skl+
3 p015 : SSE2 : pxor : _mm_xor_si128
2 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32
2 p0_5: mrm-wsm :
3 p015 : skl+
2 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
2 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32
2 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16
2 p_15 : : SSE2 : psrldq :
2 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8
2 p_15 : : SSE4 : pblendw : _mm_blend_epi16
1 p__5 : hsw-skl : *
1 p0 : SSE2 : pslld (i8) : _mm_slli_si128
2 p01 : skl+ :
2 p_15 : ivb- : SSE3 : palignr
1 p__5 : hsw+
2 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
1 p__5 + p23 : hsw-skl
1 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
0.5 2*p5 : hsw-skl
2 p23 : SSE2 : movd (m32)
3 p23A : adl :
1 p5: : SSE2 : movd (r32)
*/
#if 0 && defined(__XOP__)
// we must debug and test __XOP__ instruction
#include <x86intrin.h>
#include <ammintrin.h>
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c))
#define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c)
#define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#else
// MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
// But "orps" has low throughput: TP=1 for bdw-nhm.
// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
// But "orps" is fast for modern cpus (skl+).
// So we default to the "or" version:
#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
// minor optimization for some old cpus, if "orps" is slow.
#define MM128_EPI32_OR_or_ADD _mm_add_epi32
#else
#define MM128_EPI32_OR_or_ADD _mm_or_si128
#endif
#define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
MM128_EPI32_OR_or_ADD( \
_mm_srli_epi32((r), (c)), \
_mm_slli_epi32((r), 32-(c))))
#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
#define LOAD_ROTATE_CONSTS \
const __m128i r8 = k_r8; \
const __m128i r16 = k_r16;
#define MM_ROR_EPI32(r, c) ( \
( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
: (16==(c)) ? _mm_shuffle_epi8(r,r16) \
: MM_ROR_EPI32_VIA_SHIFT(r, c))
#else
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) ( \
(16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
: MM_ROR_EPI32_VIA_SHIFT(r, c))
#endif
#endif
/*
  There are 3 main ways to load 4 32-bit integers into an __m128i:
    1) SSE2 : _mm_set_epi32()
    2) SSE2 : _mm_unpacklo_epi64() / _mm_unpacklo_epi32() / _mm_cvtsi32_si128()
    3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
  a good compiler generates these instructions for _mm_set_epi32():
  {
    movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
  }
  a good new compiler generates one instruction per load:
  {
    for _mm_insert_epi32()  : { pinsrd xmm, [m32], i }
    for _mm_cvtsi32_si128() : { movd xmm, [m32] }
  }
  but vc2010 generates a slow pair of instructions:
  {
    for _mm_insert_epi32()  : { mov r32, [m32]; pinsrd xmm, r32, i }
    for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
  }
  _mm_insert_epi32() (pinsrd) code reduces xmm register pressure
  compared with _mm_set_epi32() (movd + vpunpckld) code.
  Note that the variant with "movd xmm, r32" can be slower,
  but register pressure can be more important.
  So we can force "pinsrd" always.
*/
// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
#ifdef Z7_BLAKE2S_USE_SSE41
/* _mm_set_epi32() can be more effective for GCC and CLANG;
   _mm_insert_epi32() is more effective for MSVC */
#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
#define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
#endif
#endif // USE_SSE41
// #endif
#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
// for SSE4.1
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_insert_epi32( \
_mm_insert_epi32( \
_mm_insert_epi32( \
_mm_cvtsi32_si128( \
*(const Int32 *)p0), \
*(const Int32 *)p1, 1), \
*(const Int32 *)p2, 2), \
*(const Int32 *)p3, 3)
#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
/* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
   Also _mm_unpacklo_epi32 is more effective for other MSVC versions.
   But _mm_set_epi32() is more effective for GCC and CLANG.
   So we use _mm_unpacklo_epi32 for MSVC only */
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_unpacklo_epi64( \
_mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
_mm_cvtsi32_si128(*(const Int32 *)p1)), \
_mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
_mm_cvtsi32_si128(*(const Int32 *)p3)))
#else
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_set_epi32( \
*(const Int32 *)p3, \
*(const Int32 *)p2, \
*(const Int32 *)p1, \
*(const Int32 *)p0)
#endif
#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
MM_LOAD_EPI32_FROM_4_POINTERS( \
GET_SIGMA_PTR(input, i0), \
GET_SIGMA_PTR(input, i1), \
GET_SIGMA_PTR(input, i2), \
GET_SIGMA_PTR(input, i3))
#define SET_ROW_FROM_SIGMA(input, sigma_index) \
SET_ROW_FROM_SIGMA_BASE(input, \
sigma[(sigma_index) ], \
sigma[(sigma_index) + 2 * 1], \
sigma[(sigma_index) + 2 * 2], \
sigma[(sigma_index) + 2 * 3])
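// SET_ROW_FROM_SIGMA gathers the four 32-bit message words for one vector row:
// k_Blake2s_Sigma_4 stores the sigma indices pre-multiplied by 4, so each entry
// is already a byte offset into the 64-byte input block. A row takes every
// second sigma entry (stride 2); AXR4 below passes sigma_index and
// sigma_index + 1 for the two message-word additions of the G function.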
#define ADD_128(a, b) _mm_add_epi32(a, b)
#define XOR_128(a, b) _mm_xor_si128(a, b)
#define D_ADD_128(dest, src) dest = ADD_128(dest, src)
#define D_XOR_128(dest, src) dest = XOR_128(dest, src)
#define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift)
#define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src)
#define AXR(a, b, d, shift) \
D_ADD_128(a, b); \
D_XOR_128(d, a); \
D_ROR_128(d, shift);
#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
AXR(a, b, d, shift1) \
AXR(c, d, b, shift2)
#define ROTATE_WORDS_TO_RIGHT(a, n) \
a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
#define AXR4(a, b, c, d, input, sigma_index) \
AXR2(a, b, c, d, input, sigma_index, 16, 12) \
AXR2(a, b, c, d, input, sigma_index + 1, 8, 7)
#define RR2(a, b, c, d, input) \
{ \
AXR4(a, b, c, d, input, 0) \
ROTATE_WORDS_TO_RIGHT(b, 1) \
ROTATE_WORDS_TO_RIGHT(c, 2) \
ROTATE_WORDS_TO_RIGHT(d, 3) \
AXR4(a, b, c, d, input, 8) \
ROTATE_WORDS_TO_RIGHT(b, 3) \
ROTATE_WORDS_TO_RIGHT(c, 2) \
ROTATE_WORDS_TO_RIGHT(d, 1) \
}
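// RR2 is one full BLAKE2s round for a single lane in "rotated rows" form:
// the first AXR4 is the column step; ROTATE_WORDS_TO_RIGHT then rotates the
// b/c/d rows by 1/2/3 words so the second AXR4 works on the diagonals, and
// the final 3/2/1 rotations restore the original word order.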
/*
  Way1:
  per 64-byte block:
    10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
    10 rounds * 4 iters * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
  additional operations per 7-op iter:
    4   movzx  byte mem
    1   movd   mem
    3   pinsrd mem
    1.5 pshufd
*/
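// Blake2s_Compress_V128_Way1 compresses one 64-byte block for a single lane:
// rows a/b hold the 8-word chaining value from s[0..7], and d starts from
// STATE_T(s) = { t0, t1, f0, f1 }; the 64-bit counter in the low qword is
// incremented by the block size and stored back before the rounds run.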
static
#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
defined(Z7_BLAKE2S_USE_V256_WAY2))
Z7_NO_INLINE
#else
Z7_FORCE_INLINE
#endif
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
{
__m128i a, b, c, d;
__m128i f0, f1;
LOAD_ROTATE_CONSTS
d = LOAD_128_FROM_STRUCT(STATE_T(s));
c = k_iv0_128;
a = f0 = LOAD_128_FROM_STRUCT(s);
b = f1 = LOAD_128_FROM_STRUCT(s + 4);
D_ADD_EPI64_128(d, k_inc);
STORE_128_TO_STRUCT (STATE_T(s), d);
D_XOR_128(d, k_iv4_128);
#define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
RR2(a, b, c, d, input) }
ROUNDS_LOOP(RR)
#undef RR
STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c)));
STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
}
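// Blake2sp_Compress2_V128_Way1 walks the data block by block and advances
// "pos" through the per-lane state records in s_items (wrapping at
// SUPER_BLOCK_MASK), so consecutive 64-byte blocks are fed to consecutive
// BLAKE2sp lanes, as the BLAKE2sp interleaving requires.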
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Blake2s_Compress_V128_Way1(s, data);
data += Z7_BLAKE2S_BLOCK_SIZE;
pos += Z7_BLAKE2S_BLOCK_SIZE;
pos &= SUPER_BLOCK_MASK;
}
while (data != end);
}
#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
defined(Z7_BLAKE2S_USE_AVX2_WAY2)
#if 1
#define Z7_BLAKE2S_CompressSingleBlock(s, data) \
Blake2sp_Compress2_V128_Way1(s, data, \
(const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
#else
#define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1
#endif
#endif
#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
!defined(Z7_BLAKE2S_USE_GATHER)
#define AXR2_LOAD_INDEXES(sigma_index) \
const unsigned i0 = sigma[(sigma_index)]; \
const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
const unsigned i3 = sigma[(sigma_index) + 2 * 3];
#define SET_ROW_FROM_SIGMA_W(input) \
SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
#endif
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
/* we use SET_ROW_FROM_SIGMA_BASE, which uses
     (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined
     (SSE2) _mm_set_epi32(), otherwise.
   MSVC can be faster for this branch:
*/
#define AXR2_W(sigma_index, shift1, shift2) \
{ \
AXR2_LOAD_INDEXES(sigma_index) \
a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
AXR(a0, b0, d0, shift1) \
AXR(a1, b1, d1, shift1) \
AXR(c0, d0, b0, shift2) \
AXR(c1, d1, b1, shift2) \
}
#else
/* we use interleaved _mm_insert_epi32():
GCC can be faster for this branch:
*/
#define AXR2_W_PRE_INSERT(sigma_index, i) \
{ const unsigned ii = sigma[(sigma_index) + i * 2]; \
t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
}
#define AXR2_W(sigma_index, shift1, shift2) \
{ __m128i t0, t1; \
{ const unsigned ii = sigma[sigma_index]; \
t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
} \
AXR2_W_PRE_INSERT(sigma_index, 1) \
AXR2_W_PRE_INSERT(sigma_index, 2) \
AXR2_W_PRE_INSERT(sigma_index, 3) \
a0 = _mm_add_epi32(a0, t0); \
a1 = _mm_add_epi32(a1, t1); \
AXR(a0, b0, d0, shift1) \
AXR(a1, b1, d1, shift1) \
AXR(c0, d0, b0, shift2) \
AXR(c1, d1, b1, shift2) \
}
#endif
#define AXR4_W(sigma_index) \
AXR2_W(sigma_index, 16, 12) \
AXR2_W(sigma_index + 1, 8, 7)
#define WW(r) \
{ const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
AXR4_W(0) \
ROTATE_WORDS_TO_RIGHT(b0, 1) \
ROTATE_WORDS_TO_RIGHT(b1, 1) \
ROTATE_WORDS_TO_RIGHT(c0, 2) \
ROTATE_WORDS_TO_RIGHT(c1, 2) \
ROTATE_WORDS_TO_RIGHT(d0, 3) \
ROTATE_WORDS_TO_RIGHT(d1, 3) \
AXR4_W(8) \
ROTATE_WORDS_TO_RIGHT(b0, 3) \
ROTATE_WORDS_TO_RIGHT(b1, 3) \
ROTATE_WORDS_TO_RIGHT(c0, 2) \
ROTATE_WORDS_TO_RIGHT(c1, 2) \
ROTATE_WORDS_TO_RIGHT(d0, 1) \
ROTATE_WORDS_TO_RIGHT(d1, 1) \
}
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
end -= Z7_BLAKE2S_BLOCK_SIZE;
if (data != end)
{
LOAD_ROTATE_CONSTS
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
__m128i a0, b0, c0, d0;
__m128i a1, b1, c1, d1;
{
const __m128i inc = k_inc;
const __m128i temp = k_iv4_128;
d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
D_ADD_EPI64_128(d0, inc);
D_ADD_EPI64_128(d1, inc);
STORE_128_TO_STRUCT (STATE_T(s ), d0);
STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
D_XOR_128(d0, temp);
D_XOR_128(d1, temp);
}
c1 = c0 = k_iv0_128;
a0 = LOAD_128_FROM_STRUCT(s);
b0 = LOAD_128_FROM_STRUCT(s + 4);
a1 = LOAD_128_FROM_STRUCT(s + NSW);
b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
ROUNDS_LOOP (WW)
#undef WW
D_XOR_128(a0, c0);
D_XOR_128(b0, d0);
D_XOR_128(a1, c1);
D_XOR_128(b1, d1);
D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
STORE_128_TO_STRUCT(s, a0);
STORE_128_TO_STRUCT(s + 4, b0);
STORE_128_TO_STRUCT(s + NSW, a1);
STORE_128_TO_STRUCT(s + NSW + 4, b1);
data += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos &= SUPER_BLOCK_MASK;
}
while (data < end);
if (data != end)
return;
}
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Z7_BLAKE2S_CompressSingleBlock(s, data);
}
}
#endif // Z7_BLAKE2S_USE_V128_WAY2
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2
#else
#define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1
#endif
#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#define ROT_128_8(x) MM_ROR_EPI32(x, 8)
#define ROT_128_16(x) MM_ROR_EPI32(x, 16)
#define ROT_128_7(x) MM_ROR_EPI32(x, 7)
#define ROT_128_12(x) MM_ROR_EPI32(x, 12)
#else
#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
#define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8
#define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16
#else
#define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8)
#define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16)
#endif
#define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7)
#define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12)
#endif
#if 1
// this branch can provide similar speed on x86* in most cases,
// because [base + index*4] provides same speed as [base + index].
// but some compilers can generate different code with this branch, that can be faster sometimes.
// this branch uses additional table of 10*16=160 bytes.
#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r];
#define GET_SIGMA_VAL_128(n) (sigma[n])
#else
#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n])
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#if 1
#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(64)
static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
#define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
#define GET_SIGMA_VAL_256(n) (sigma[n])
#else
#define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n])
#endif
#endif // Z7_BLAKE2S_USE_AVX2_FAST
#define D_ROT_128_7(dest) dest = ROT_128_7(dest)
#define D_ROT_128_8(dest) dest = ROT_128_8(dest)
#define D_ROT_128_12(dest) dest = ROT_128_12(dest)
#define D_ROT_128_16(dest) dest = ROT_128_16(dest)
#define OP_L(a, i) D_ADD_128 (V(a, 0), \
LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
#define OP_0(a) OP_L(a, 0)
#define OP_7(a) OP_L(a, 1)
#define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1));
#define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0));
#define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3));
#define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2));
#define OP_3(a) D_ROT_128_16 (V(a, 3));
#define OP_6(a) D_ROT_128_12 (V(a, 1));
#define OP_8(a) D_ROT_128_8 (V(a, 3));
#define OP_9(a) D_ROT_128_7 (V(a, 1));
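// The OP_x macros are the G function split into single vector operations,
// with the rows of G call "a" held in V(a,0)..V(a,3) (its a/b/c/d rows):
// OP_0/OP_7 add the two message words (loaded from the message buffer w via
// the pre-scaled sigma offsets), OP_1: a += b, OP_2: d ^= a,
// OP_3/OP_8: ror(d,16)/ror(d,8), OP_4: c += d, OP_5: b ^= c,
// OP_6/OP_9: ror(b,12)/ror(b,7). The V4G sequences below replay them in
// the G order, mixing 4 (or 8, for AVX2) independent lanes per operation.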
// for 32-bit x86 : the interleaved mode works slower, because of register pressure.
#if 0 || 1 && (defined(MY_CPU_X86) \
|| defined(__GNUC__) && !defined(__clang__))
// non-interleaved version:
// is fast for x86 32-bit.
// is fast for GCC x86-64.
#define V4G(a) \
OP_0 (a) \
OP_1 (a) \
OP_2 (a) \
OP_3 (a) \
OP_4 (a) \
OP_5 (a) \
OP_6 (a) \
OP_7 (a) \
OP_1 (a) \
OP_2 (a) \
OP_8 (a) \
OP_4 (a) \
OP_5 (a) \
OP_9 (a)
#define V4R \
{ \
V4G (0) \
V4G (1) \
V4G (2) \
V4G (3) \
V4G (4) \
V4G (5) \
V4G (6) \
V4G (7) \
}
#elif 0 || 1 && defined(MY_CPU_X86)
#define OP_INTER_2(op, a,b) \
op (a) \
op (b)
#define V4G(a,b) \
OP_INTER_2 (OP_0, a,b) \
OP_INTER_2 (OP_1, a,b) \
OP_INTER_2 (OP_2, a,b) \
OP_INTER_2 (OP_3, a,b) \
OP_INTER_2 (OP_4, a,b) \
OP_INTER_2 (OP_5, a,b) \
OP_INTER_2 (OP_6, a,b) \
OP_INTER_2 (OP_7, a,b) \
OP_INTER_2 (OP_1, a,b) \
OP_INTER_2 (OP_2, a,b) \
OP_INTER_2 (OP_8, a,b) \
OP_INTER_2 (OP_4, a,b) \
OP_INTER_2 (OP_5, a,b) \
OP_INTER_2 (OP_9, a,b)
#define V4R \
{ \
V4G (0, 1) \
V4G (2, 3) \
V4G (4, 5) \
V4G (6, 7) \
}
#else
// interleave-4 version is fast for x64 (MSVC/CLANG)
#define OP_INTER_4(op, a,b,c,d) \
op (a) \