source: UsbWattMeter/trunk/wolfssl-3.7.0/wolfcrypt/src/sha512.c

Last change on this file was 167, checked in by coas-nagasima, 8 years ago

Set the MIME charset to SHIFT_JIS

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
  • Property svn:mime-type set to text/x-csrc; charset=SHIFT_JIS
File size: 60.7 KB
/* sha512.c
 *
 * Copyright (C) 2006-2015 wolfSSL Inc.
 *
 * This file is part of wolfSSL. (formerly known as CyaSSL)
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/sha512.h>

#ifdef WOLFSSL_SHA512

#ifdef HAVE_FIPS
int wc_InitSha512(Sha512* sha)
{
    return InitSha512_fips(sha);
}


int wc_Sha512Update(Sha512* sha, const byte* data, word32 len)
{
    return Sha512Update_fips(sha, data, len);
}


int wc_Sha512Final(Sha512* sha, byte* out)
{
    return Sha512Final_fips(sha, out);
}


#if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)

int wc_InitSha384(Sha384* sha)
{
    return InitSha384_fips(sha);
}


int wc_Sha384Update(Sha384* sha, const byte* data, word32 len)
{
    return Sha384Update_fips(sha, data, len);
}


int wc_Sha384Final(Sha384* sha, byte* out)
{
    return Sha384Final_fips(sha, out);
}


#endif /* WOLFSSL_SHA384 || HAVE_AESGCM */
#else /* else build without using fips */
#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #include <wolfcrypt/src/misc.c>
#endif


#ifdef min
#define WOLFSSL_HAVE_MIN
#endif
#ifndef WOLFSSL_HAVE_MIN
#define WOLFSSL_HAVE_MIN

    static INLINE word32 min(word32 a, word32 b)
    {
        return a > b ? b : a;
    }

#endif /* WOLFSSL_HAVE_MIN */

#if defined(USE_INTEL_SPEEDUP)
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2
#endif

#if defined(HAVE_INTEL_AVX1)
/* #define DEBUG_XMM */
#endif

#if defined(HAVE_INTEL_AVX2)
#define HAVE_INTEL_RORX
/* #define DEBUG_YMM */
#endif

/*****
Intel AVX1/AVX2 Macro Control Structure

#if defined(HAVE_INTEL_SPEEDUP)
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2
#endif

int InitSha512(Sha512* sha512) {
    Save/Recover XMM, YMM
    ...

    Check Intel AVX cpuid flags
}

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    Transform_AVX1() ;  # Function prototype
    Transform_AVX2() ;  #
#endif

_Transform() {          # Native Transform Function body

}

int Sha512Update() {
    Save/Recover XMM, YMM
    ...
}

int Sha512Final() {
    Save/Recover XMM, YMM
    ...
}


#if defined(HAVE_INTEL_AVX1)

    XMM Instructions/INLINE asm Definitions

#endif

#if defined(HAVE_INTEL_AVX2)

    YMM Instructions/INLINE asm Definitions

#endif

#if defined(HAVE_INTEL_AVX1)

    int Transform_AVX1() {
        Stitched Message Sched/Round
    }

#endif

#if defined(HAVE_INTEL_AVX2)

    int Transform_AVX2() {
        Stitched Message Sched/Round
    }
#endif


*/

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)


/* Each platform needs to query info type 1 from cpuid to see if AVX is
 * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
 */

#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
        __asm__ __volatile__ ("cpuid":\
            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
            "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    /* three-operand form so calls like cpuid(reg, leaf, sub) below compile */
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */

#define IS_INTEL_AVX1   (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2   (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2   (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)

static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;

static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0' ;
    cpuid(reg, 0, 0);
    if (memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return ((reg[num] >> bit) & 0x1) ;
    }
    return 0 ;
}

#define CHECK_SHA512 0x1
#define CHECK_SHA384 0x2

static int set_cpuid_flags(int sha) {
    if ((cpuid_check & sha) == 0) {
        if (cpuid_flag(1, 0, ECX, 28)) { cpuid_flags |= CPUID_AVX1 ; }
        if (cpuid_flag(7, 0, EBX, 5))  { cpuid_flags |= CPUID_AVX2 ; }
        if (cpuid_flag(7, 0, EBX, 8))  { cpuid_flags |= CPUID_BMI2 ; }
        if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND ; }
        if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED ; }
        cpuid_check |= sha ;
        return 0 ;
    }
    return 1 ;
}
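
/* A minimal self-test sketch (not part of the library build, hence the
 * "#if 0" guard): it exercises set_cpuid_flags() above and prints which
 * detected features would steer the Transform selection below. The main()
 * harness and stdio use are illustrative assumptions, not wolfSSL API.
 */
#if 0
#include <stdio.h>

int main(void)
{
    (void)set_cpuid_flags(CHECK_SHA512);   /* fills cpuid_flags once */
    printf("AVX1: %s\n", IS_INTEL_AVX1 ? "yes" : "no");
    printf("AVX2: %s\n", IS_INTEL_AVX2 ? "yes" : "no");
    printf("BMI2: %s\n", IS_INTEL_BMI2 ? "yes" : "no");  /* RORX/MULX */
    return 0;
}
#endif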


/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */

#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha512 *sha512) ;
#endif

#if defined(HAVE_INTEL_AVX2)
static int Transform_AVX2(Sha512 *sha512) ;

#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
static int Transform_AVX1_RORX(Sha512 *sha512) ;
#endif

#endif

static int _Transform(Sha512 *sha512) ;

static int (*Transform_p)(Sha512* sha512) = _Transform ;

#define Transform(sha512) (*Transform_p)(sha512)

static void set_Transform(void) {
    if (set_cpuid_flags(CHECK_SHA512)) return ;

#if defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
        Transform_p = Transform_AVX1_RORX ; return ;
        Transform_p = Transform_AVX2 ;
        /* unreachable assignment: keeps Transform_AVX2 referenced so the
           compiler does not emit a "defined but not used" warning */
    }
#endif
#if defined(HAVE_INTEL_AVX1)
    Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; return ;
#endif
    Transform_p = _Transform ; return ;
}

#else
    #define Transform(sha512) _Transform(sha512)
#endif

/* Dummy for saving MM_REGs on behalf of Transform */
/* #if defined(HAVE_INTEL_AVX2)
    #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\
        "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
        "%ymm12","%ymm13","%ymm14","%ymm15")
*/
#if defined(HAVE_INTEL_AVX1)
    #define SAVE_XMM_YMM __asm__ volatile("orq %%r8, %%r8":::\
        "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
#else
    #define SAVE_XMM_YMM
#endif

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

#include <string.h>

#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */


#if defined(HAVE_INTEL_RORX)
#define ROTR(func, bits, x) \
word64 func(word64 x) { word64 ret ;\
    __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\
    return ret ;\
}

static INLINE ROTR(rotrFixed64_28, 28, x)
static INLINE ROTR(rotrFixed64_34, 34, x)
static INLINE ROTR(rotrFixed64_39, 39, x)
static INLINE ROTR(rotrFixed64_14, 14, x)
static INLINE ROTR(rotrFixed64_18, 18, x)
static INLINE ROTR(rotrFixed64_41, 41, x)

#define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x))
#define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x))
#endif
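
/* RORX (a BMI2 instruction) performs a 64-bit rotate-right without touching
 * the flags register, which is why the stitched loops below prefer it. A
 * portable C equivalent is shown only for reference; the library's real
 * fallback is rotrFixed64() from misc.c, so this sketch stays compiled out.
 */
#if 0
static INLINE word64 rotr64_ref(word64 x, unsigned int n)
{
    /* same result as "rorx $n" for 0 < n < 64 */
    return (x >> n) | (x << (64 - n));
}
#endif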

#if defined(HAVE_BYTEREVERSE64) && !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
#define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
#define ByteReverseWords64_1(buf, size)\
    { unsigned int i ;\
        for(i=0; i< size/sizeof(word64); i++){\
            __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
        }\
    }
#endif


int wc_InitSha512(Sha512* sha512)
{
    sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
    sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
    sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
    sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
    sha512->digest[4] = W64LIT(0x510e527fade682d1);
    sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
    sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
    sha512->digest[7] = W64LIT(0x5be0cd19137e2179);

    sha512->buffLen = 0;
    sha512->loLen   = 0;
    sha512->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    set_Transform() ; /* choose best Transform function under this runtime environment */
#endif

    return 0 ;
}


static const word64 K512[80] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};



#define blk0(i) (W[i] = sha512->buffer[i])

#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))

#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))

#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]

#define S0(x) (rotrFixed64(x,28)^rotrFixed64(x,34)^rotrFixed64(x,39))
#define S1(x) (rotrFixed64(x,14)^rotrFixed64(x,18)^rotrFixed64(x,41))
#define s0(x) (rotrFixed64(x,1)^rotrFixed64(x,8)^(x>>7))
#define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6))

#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
    d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))

#define blk384(i) (W[i] = sha384->buffer[i])

#define R2(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk384(i));\
    d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
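
/* How the macros above work, in brief: T[] holds the eight working
 * variables, and a(i)..h(i) index it modulo 8 with an offset of -i, so each
 * round "renames" the registers instead of copying them (after round 0,
 * a(1) refers to the slot h(0) just wrote). The message schedule is kept in
 * a rolling 16-entry window: on the first pass (j == 0) blk0()/blk384()
 * load the block, and on later passes blk2() expands
 * W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] in place, which is
 * exactly the FIPS 180-4 recurrence.
 */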

static int _Transform(Sha512* sha512)
{
    const word64* K = K512;

    word32 j;
    word64 T[8];


#ifdef WOLFSSL_SMALL_STACK
    word64* W;
    W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word64 W[16];
#endif

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

#ifdef USE_SLOW_SHA2
    /* less than half the code size, but 50% slower */
    /* 80 operations, not unrolled */
    for (j = 0; j < 80; j += 16) {
        int m;
        for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
            R(m);
        }
    }
#else
    /* 80 operations, partially loop unrolled */
    for (j = 0; j < 80; j += 16) {
        R( 0); R( 1); R( 2); R( 3);
        R( 4); R( 5); R( 6); R( 7);
        R( 8); R( 9); R(10); R(11);
        R(12); R(13); R(14); R(15);
    }
#endif /* USE_SLOW_SHA2 */

    /* Add the working vars back into digest */

    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    ForceZero(W, sizeof(word64) * 16);
    ForceZero(T, sizeof(T));

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}


static INLINE void AddLength(Sha512* sha512, word32 len)
{
    word32 tmp = sha512->loLen;
    if ( (sha512->loLen += len) < tmp)
        sha512->hiLen++;                       /* carry low to high */
}
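
/* Note on the counter: loLen/hiLen together form a 64-bit count of bytes
 * hashed so far (AddLength propagates overflow of the low word into the
 * high word). wc_Sha512Final() later shifts the pair left by 3 to turn
 * bytes into bits before storing it as the trailing 128-bit length field
 * that SHA-512 padding requires.
 */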

int wc_Sha512Update(Sha512* sha512, const byte* data, word32 len)
{
    /* do block size increments */
    byte* local = (byte*)sha512->buffer;

    SAVE_XMM_YMM ; /* for Intel AVX */

    while (len) {
        word32 add = min(len, SHA512_BLOCK_SIZE - sha512->buffLen);
        XMEMCPY(&local[sha512->buffLen], data, add);

        sha512->buffLen += add;
        data            += add;
        len             -= add;

        if (sha512->buffLen == SHA512_BLOCK_SIZE) {
            int ret;

            #if defined(LITTLE_ENDIAN_ORDER)
                #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
                if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
                #endif
                    ByteReverseWords64(sha512->buffer, sha512->buffer,
                                       SHA512_BLOCK_SIZE);
            #endif
            ret = Transform(sha512);
            if (ret != 0)
                return ret;

            AddLength(sha512, SHA512_BLOCK_SIZE);
            sha512->buffLen = 0;
        }
    }
    return 0;
}


int wc_Sha512Final(Sha512* sha512, byte* hash)
{
    byte* local = (byte*)sha512->buffer;
    int ret;

    SAVE_XMM_YMM ; /* for Intel AVX */
    AddLength(sha512, sha512->buffLen);               /* before adding pads */

    local[sha512->buffLen++] = 0x80;  /* add 1 */

    /* pad with zeros */
    if (sha512->buffLen > SHA512_PAD_SIZE) {
        XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen);
        sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen;
        #if defined(LITTLE_ENDIAN_ORDER)
            #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
            #endif
                ByteReverseWords64(sha512->buffer, sha512->buffer,
                                   SHA512_BLOCK_SIZE);
        #endif
        ret = Transform(sha512);
        if (ret != 0)
            return ret;

        sha512->buffLen = 0;
    }
    XMEMSET(&local[sha512->buffLen], 0, SHA512_PAD_SIZE - sha512->buffLen);

    /* put lengths in bits */
    sha512->hiLen = (sha512->loLen >> (8*sizeof(sha512->loLen) - 3)) +
                    (sha512->hiLen << 3);
    sha512->loLen = sha512->loLen << 3;

    /* store lengths */
    #if defined(LITTLE_ENDIAN_ORDER)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
        #endif
            ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
    #endif
    /* ! length ordering dependent on digest endian type ! */

    sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
    sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
        ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           &(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           SHA512_BLOCK_SIZE - SHA512_PAD_SIZE);
    #endif
    ret = Transform(sha512);
    if (ret != 0)
        return ret;

    #ifdef LITTLE_ENDIAN_ORDER
        ByteReverseWords64(sha512->digest, sha512->digest, SHA512_DIGEST_SIZE);
    #endif
    XMEMCPY(hash, sha512->digest, SHA512_DIGEST_SIZE);

    return wc_InitSha512(sha512);  /* reset state */
}
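
/* A minimal usage sketch of the streaming API defined above, compiled out
 * so it does not affect the library build. The expected digest is the
 * FIPS 180-4 test vector for SHA-512("abc"); the selftest harness and
 * stdio use are illustrative assumptions.
 */
#if 0
#include <stdio.h>

int sha512_abc_selftest(void)
{
    Sha512 sha;
    byte   digest[SHA512_DIGEST_SIZE];
    int    i, ret;

    ret = wc_InitSha512(&sha);                               /* also picks Transform_p */
    if (ret == 0)
        ret = wc_Sha512Update(&sha, (const byte*)"abc", 3);  /* may be called repeatedly */
    if (ret == 0)
        ret = wc_Sha512Final(&sha, digest);                  /* writes 64 bytes, resets state */
    if (ret != 0)
        return ret;

    for (i = 0; i < SHA512_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");   /* expect ddaf35a1...a54ca49f per FIPS 180-4 */
    return 0;
}
#endif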



#if defined(HAVE_INTEL_AVX1)

#define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ;
#define Rx_2(i) d(i)+=h(i);
#define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));

#if defined(HAVE_INTEL_RORX)
#define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ;
#define Rx_RORX_2(i) d(i)+=h(i);
#define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i));
#endif

#endif

#if defined(HAVE_INTEL_AVX2)
#define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w ;
#define Ry_2(i, w) d(i)+=h(i);
#define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));
#endif

#if defined(HAVE_INTEL_AVX1) /* inline assembly for Intel AVX1 instructions */
#if defined(DEBUG_XMM)

#define SAVE_REG(i)     __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0])::XMM_REGs);
#define RECV_REG(i)     __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]):XMM_REGs);

#define _DUMP_REG(REG, name)\
    { word64 buf[16] ;word64 reg[16][2];int k ;\
      SAVE_REG(0); SAVE_REG(1); SAVE_REG(2);  SAVE_REG(3);  SAVE_REG(4); \
      SAVE_REG(5); SAVE_REG(6); SAVE_REG(7); SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\
      SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \
      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::XMM_REGs);\
      printf(" "#name":\t") ; for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n") ; \
      RECV_REG(0); RECV_REG(1); RECV_REG(2);  RECV_REG(3);  RECV_REG(4);\
      RECV_REG(5); RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\
      RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\
    }

#define DUMP_REG(REG) _DUMP_REG(REG, #REG)
#define PRINTF(fmt, ...)

#else

#define DUMP_REG(REG)
#define PRINTF(fmt, ...)

#endif

#define _MOVE_to_REG(xymm, mem)       __asm__ volatile("vmovdqu %0, %%"#xymm" "\
        :: "m"(mem):XMM_REGs) ;
#define _MOVE_to_MEM(mem,i, xymm)     __asm__ volatile("vmovdqu %%"#xymm", %0" :\
        "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::XMM_REGs) ;
#define _MOVE(dest, src)              __asm__ volatile("vmovdqu %%"#src", %%"\
        #dest" ":::XMM_REGs) ;

#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrlq $"#bits", %%"\
        #src", %%"#dest"\n\tvpsllq $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
        #temp",%%"#dest", %%"#dest" ":::XMM_REGs) ;
#define _AVX1_R(dest, src, bits)      __asm__ volatile("vpsrlq $"#bits", %%"\
        #src", %%"#dest" ":::XMM_REGs) ;
#define _XOR(dest, src1, src2)        __asm__ volatile("vpxor %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs) ;
#define _OR(dest, src1, src2)         __asm__ volatile("vpor %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs) ;
#define _ADD(dest, src1, src2)        __asm__ volatile("vpaddq %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs) ;
#define _ADD_MEM(dest, src1, mem)     __asm__ volatile("vpaddq %0, %%"#src1", %%"\
        #dest" "::"m"(mem):XMM_REGs) ;

#define MOVE_to_REG(xymm, mem)      _MOVE_to_REG(xymm, mem)
#define MOVE_to_MEM(mem, i, xymm)   _MOVE_to_MEM(mem, i, xymm)
#define MOVE(dest, src)             _MOVE(dest, src)

#define XOR(dest, src1, src2)       _XOR(dest, src1, src2)
#define OR(dest, src1, src2)        _OR(dest, src1, src2)
#define ADD(dest, src1, src2)       _ADD(dest, src1, src2)

#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
#define AVX1_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
#define AVX1_R(dest, src, bits)      _AVX1_R(dest, src, bits)

#define Init_Mask(mask) \
    __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1") ;

#define _W_from_buff1(w, buff, xmm) \
    /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0..15]; */\
    __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\
                     "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\
                     "vmovdqu %%"#xmm", %0"\
                     :"=m"(w): "m"(buff):"%xmm0") ;

#define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm)

#define W_from_buff(w, buff)\
    Init_Mask(mBYTE_FLIP_MASK[0]) ;\
    W_from_buff1(w[0], buff[0], W_0);\
    W_from_buff1(w[2], buff[2], W_2);\
    W_from_buff1(w[4], buff[4], W_4);\
    W_from_buff1(w[6], buff[6], W_6);\
    W_from_buff1(w[8], buff[8], W_8);\
    W_from_buff1(w[10],buff[10],W_10);\
    W_from_buff1(w[12],buff[12],W_12);\
    W_from_buff1(w[14],buff[14],W_14);

static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f } ;

#define W_I_15    xmm14
#define W_I_7     xmm11
#define W_I_2     xmm13
#define W_I       xmm12
#define G_TEMP    xmm0
#define S_TEMP    xmm1
#define XMM_TEMP0 xmm2

#define W_0   xmm12
#define W_2   xmm3
#define W_4   xmm4
#define W_6   xmm5
#define W_8   xmm6
#define W_10  xmm7
#define W_12  xmm8
#define W_14  xmm9

#define XMM_REGs

#define s0_1(dest, src)  AVX1_S(dest, src, 1);
#define s0_2(dest, src)  AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest) ;
#define s0_3(dest, src)  AVX1_R(G_TEMP, src, 7); XOR(dest, G_TEMP, dest) ;

#define s1_1(dest, src)  AVX1_S(dest, src, 19);
#define s1_2(dest, src)  AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest) ;
#define s1_3(dest, src)  AVX1_R(G_TEMP, src, 6);  XOR(dest, G_TEMP, dest) ;

#define s0_(dest, src)   s0_1(dest, src) ; s0_2(dest, src) ; s0_3(dest, src)
#define s1_(dest, src)   s1_1(dest, src) ; s1_2(dest, src) ; s1_3(dest, src)

#define Block_xx_1(i) \
    MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\
    MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]) ;\

#define Block_xx_2(i) \
    MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]) ;\
    MOVE_to_REG(W_I,    W_X[(i)]) ;\

#define Block_xx_3(i) \
    s0_ (XMM_TEMP0, W_I_15) ;\

#define Block_xx_4(i) \
    ADD(W_I, W_I, XMM_TEMP0) ;\
    ADD(W_I, W_I, W_I_7) ;\

#define Block_xx_5(i) \
    s1_ (XMM_TEMP0, W_I_2) ;\

#define Block_xx_6(i) \
    ADD(W_I, W_I, XMM_TEMP0) ;\
    MOVE_to_MEM(W_X,i, W_I) ;\
    if(i==0)\
        MOVE_to_MEM(W_X,16, W_I) ;\

#define Block_xx_7(i) \
    MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\
    MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]) ;\

#define Block_xx_8(i) \
    MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]) ;\
    MOVE_to_REG(W_I,    W_X[(i)]) ;\

#define Block_xx_9(i) \
    s0_ (XMM_TEMP0, W_I_15) ;\

#define Block_xx_10(i) \
    ADD(W_I, W_I, XMM_TEMP0) ;\
    ADD(W_I, W_I, W_I_7) ;\

#define Block_xx_11(i) \
    s1_ (XMM_TEMP0, W_I_2) ;\

#define Block_xx_12(i) \
    ADD(W_I, W_I, XMM_TEMP0) ;\
    MOVE_to_MEM(W_X,i, W_I) ;\
    if((i)==0)\
        MOVE_to_MEM(W_X,16, W_I) ;\

static INLINE void Block_0_1(word64 *W_X) { Block_xx_1(0) ; }
static INLINE void Block_0_2(word64 *W_X) { Block_xx_2(0) ; }
static INLINE void Block_0_3(void) { Block_xx_3(0) ; }
static INLINE void Block_0_4(void) { Block_xx_4(0) ; }
static INLINE void Block_0_5(void) { Block_xx_5(0) ; }
static INLINE void Block_0_6(word64 *W_X) { Block_xx_6(0) ; }
static INLINE void Block_0_7(word64 *W_X) { Block_xx_7(2) ; }
static INLINE void Block_0_8(word64 *W_X) { Block_xx_8(2) ; }
static INLINE void Block_0_9(void) { Block_xx_9(2) ; }
static INLINE void Block_0_10(void){ Block_xx_10(2) ; }
static INLINE void Block_0_11(void){ Block_xx_11(2) ; }
static INLINE void Block_0_12(word64 *W_X){ Block_xx_12(2) ; }

static INLINE void Block_4_1(word64 *W_X) { Block_xx_1(4) ; }
static INLINE void Block_4_2(word64 *W_X) { Block_xx_2(4) ; }
static INLINE void Block_4_3(void) { Block_xx_3(4) ; }
static INLINE void Block_4_4(void) { Block_xx_4(4) ; }
static INLINE void Block_4_5(void) { Block_xx_5(4) ; }
static INLINE void Block_4_6(word64 *W_X) { Block_xx_6(4) ; }
static INLINE void Block_4_7(word64 *W_X) { Block_xx_7(6) ; }
static INLINE void Block_4_8(word64 *W_X) { Block_xx_8(6) ; }
static INLINE void Block_4_9(void) { Block_xx_9(6) ; }
static INLINE void Block_4_10(void){ Block_xx_10(6) ; }
static INLINE void Block_4_11(void){ Block_xx_11(6) ; }
static INLINE void Block_4_12(word64 *W_X){ Block_xx_12(6) ; }

static INLINE void Block_8_1(word64 *W_X) { Block_xx_1(8) ; }
static INLINE void Block_8_2(word64 *W_X) { Block_xx_2(8) ; }
static INLINE void Block_8_3(void) { Block_xx_3(8) ; }
static INLINE void Block_8_4(void) { Block_xx_4(8) ; }
static INLINE void Block_8_5(void) { Block_xx_5(8) ; }
static INLINE void Block_8_6(word64 *W_X) { Block_xx_6(8) ; }
static INLINE void Block_8_7(word64 *W_X) { Block_xx_7(10) ; }
static INLINE void Block_8_8(word64 *W_X) { Block_xx_8(10) ; }
static INLINE void Block_8_9(void) { Block_xx_9(10) ; }
static INLINE void Block_8_10(void){ Block_xx_10(10) ; }
static INLINE void Block_8_11(void){ Block_xx_11(10) ; }
static INLINE void Block_8_12(word64 *W_X){ Block_xx_12(10) ; }

static INLINE void Block_12_1(word64 *W_X) { Block_xx_1(12) ; }
static INLINE void Block_12_2(word64 *W_X) { Block_xx_2(12) ; }
static INLINE void Block_12_3(void) { Block_xx_3(12) ; }
static INLINE void Block_12_4(void) { Block_xx_4(12) ; }
static INLINE void Block_12_5(void) { Block_xx_5(12) ; }
static INLINE void Block_12_6(word64 *W_X) { Block_xx_6(12) ; }
static INLINE void Block_12_7(word64 *W_X) { Block_xx_7(14) ; }
static INLINE void Block_12_8(word64 *W_X) { Block_xx_8(14) ; }
static INLINE void Block_12_9(void) { Block_xx_9(14) ; }
static INLINE void Block_12_10(void){ Block_xx_10(14) ; }
static INLINE void Block_12_11(void){ Block_xx_11(14) ; }
static INLINE void Block_12_12(word64 *W_X){ Block_xx_12(14) ; }

#endif

#if defined(HAVE_INTEL_AVX2)
static const unsigned long mBYTE_FLIP_MASK_Y[] =
    { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f } ;

#define W_from_buff_Y(buff)\
    { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0..15]; */\
    __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]):YMM_REGs) ;\
    __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\
                     "vmovdqu %1, %%ymm4\n\t"\
                     "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\
                     "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\
                     :: "m"(buff[0]), "m"(buff[4]):YMM_REGs) ;\
    __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\
                     "vmovdqu %1, %%ymm6\n\t"\
                     "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\
                     "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\
                     :: "m"(buff[8]), "m"(buff[12]):YMM_REGs) ;\
    }

#if defined(DEBUG_YMM)

#define SAVE_REG_Y(i)   __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0])::YMM_REGs);
#define RECV_REG_Y(i)   __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]):YMM_REGs);

#define _DUMP_REG_Y(REG, name)\
    { word64 buf[16] ;word64 reg[16][2];int k ;\
      SAVE_REG_Y(4);  SAVE_REG_Y(5);  SAVE_REG_Y(6);  SAVE_REG_Y(7); \
      SAVE_REG_Y(8);  SAVE_REG_Y(9);  SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\
      SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \
      __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\
      printf(" "#name":\t") ; for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]) ; printf("\n") ; \
      RECV_REG_Y(4);  RECV_REG_Y(5);  RECV_REG_Y(6);  RECV_REG_Y(7); \
      RECV_REG_Y(8);  RECV_REG_Y(9);  RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \
      RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\
    }

#define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG)
#define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG)
#define PRINTF_Y(fmt, ...)

#else

#define DUMP_REG_Y(REG)
#define DUMP_REG2_Y(REG)
#define PRINTF_Y(fmt, ...)

#endif

#define _MOVE_to_REGy(ymm, mem)         __asm__ volatile("vmovdqu %0, %%"#ymm" "\
        :: "m"(mem):YMM_REGs) ;
#define _MOVE_to_MEMy(mem,i, ymm)       __asm__ volatile("vmovdqu %%"#ymm", %0" \
        : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::YMM_REGs) ;
#define _MOVE_128y(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"\
        #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
#define _S_TEMPy(dest, src, bits, temp) \
        __asm__ volatile("vpsrlq $"#bits", %%"#src", %%"#dest"\n\tvpsllq $64-"#bits\
        ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
#define _AVX2_R(dest, src, bits)        __asm__ volatile("vpsrlq $"#bits", %%"\
        #src", %%"#dest" ":::YMM_REGs) ;
#define _XORy(dest, src1, src2)         __asm__ volatile("vpxor %%"#src1", %%"\
        #src2", %%"#dest" ":::YMM_REGs) ;
#define _ADDy(dest, src1, src2)         __asm__ volatile("vpaddq %%"#src1", %%"\
        #src2", %%"#dest" ":::YMM_REGs) ;
#define _BLENDy(map, dest, src1, src2)  __asm__ volatile("vpblendd $"#map", %%"\
        #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ;
#define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd $"#map", %%"\
        #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ;
#define _PERMQy(map, dest, src)         __asm__ volatile("vpermq $"#map", %%"\
        #src", %%"#dest" ":::YMM_REGs) ;

#define MOVE_to_REGy(ymm, mem)           _MOVE_to_REGy(ymm, mem)
#define MOVE_to_MEMy(mem, i, ymm)        _MOVE_to_MEMy(mem, i, ymm)

#define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map)
#define XORy(dest, src1, src2)           _XORy(dest, src1, src2)
#define ADDy(dest, src1, src2)           _ADDy(dest, src1, src2)
#define BLENDy(map, dest, src1, src2)    _BLENDy(map, dest, src1, src2)
#define BLENDQy(map, dest, src1, src2)   _BLENDQy(map, dest, src1, src2)
#define PERMQy(map, dest, src)           _PERMQy(map, dest, src)


#define S_TMPy(dest, src, bits, temp)    _S_TEMPy(dest, src, bits, temp);
#define AVX2_S(dest, src, bits)          S_TMPy(dest, src, bits, S_TEMPy)
#define AVX2_R(dest, src, bits)          _AVX2_R(dest, src, bits)


#define FEEDBACK1_to_W_I_2(w_i_2, w_i)     MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08) ;\
                                           BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2) ;

#define MOVE_W_to_W_I_15(w_i_15, w_0, w_4) BLENDQy(0x1, w_i_15, w_4, w_0) ;\
                                           PERMQy(0x39, w_i_15, w_i_15) ;
#define MOVE_W_to_W_I_7(w_i_7, w_8, w_12)  BLENDQy(0x1, w_i_7, w_12, w_8) ;\
                                           PERMQy(0x39, w_i_7, w_i_7) ;
#define MOVE_W_to_W_I_2(w_i_2, w_12)       BLENDQy(0xc, w_i_2, w_12, w_i_2) ;\
                                           PERMQy(0x0e, w_i_2, w_i_2) ;


#define W_I_16y    ymm8
#define W_I_15y    ymm9
#define W_I_7y     ymm10
#define W_I_2y     ymm11
#define W_Iy       ymm12
#define G_TEMPy    ymm13
#define S_TEMPy    ymm14
#define YMM_TEMP0  ymm15
#define YMM_TEMP0x xmm15
#define W_I_TEMPy  ymm7
#define W_K_TEMPy  ymm15
#define W_K_TEMPx  xmm15
#define W_0y       ymm12
#define W_4y       ymm4
#define W_8y       ymm5
#define W_12y      ymm6

#define YMM_REGs
/* Registers are saved in Sha512Update/Final */
/* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/

#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
    __asm__ volatile("vpblendd  $0x08, %%"#w_i_15", %%"#w_i_7",  %%"#w_i_16" ":::YMM_REGs) ;\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7",  %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs) ;\
    __asm__ volatile("vpblendd  $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
    __asm__ volatile("vpshufd   $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\

#define MOVE_7_to_15(w_i_15, w_i_7)\
    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\

#define MOVE_I_to_7(w_i_7, w_i)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i",   %%"#w_i",  %%"#w_i_7" ":::YMM_REGs) ;\
    __asm__ volatile("vpblendd  $0x01, %%"#w_i_7",  %%"#w_i",  %%"#w_i_7" ":::YMM_REGs) ;\
    __asm__ volatile("vpshufd   $0x39, %%"#w_i_7",  %%"#w_i_7" ":::YMM_REGs) ;\

#define MOVE_I_to_2(w_i_2, w_i)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i",   %%"#w_i",  %%"#w_i_2" ":::YMM_REGs) ;\
    __asm__ volatile("vpshufd   $0x0e, %%"#w_i_2",  %%"#w_i_2" ":::YMM_REGs) ;\

#endif


/*** Transform Body ***/
#if defined(HAVE_INTEL_AVX1)

static int Transform_AVX1(Sha512* sha512)
{
    const word64* K = K512;
    word64 W_X[16+4];
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff(W_X, sha512->buffer) ;
    for (j = 0; j < 80; j += 16) {
        Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3();
        Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X);
        Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
        Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);

        Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3();
        Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X);
        Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
        Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);

        Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3();
        Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X);
        Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
        Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);

        Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3();
        Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X);
        Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
        Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);
    }

    /* Add the working vars back into digest */

    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);
    #endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}

#endif

#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)

static int Transform_AVX1_RORX(Sha512* sha512)
{
    const word64* K = K512;
    word64 W_X[16+4];
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff(W_X, sha512->buffer) ;
    for (j = 0; j < 80; j += 16) {
        Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X);
        Rx_RORX_3( 0); Block_0_3();
        Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5();
        Rx_RORX_3( 1); Block_0_6(W_X);
        Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X);
        Rx_RORX_3( 2); Block_0_9();
        Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11();
        Rx_RORX_3( 3); Block_0_12(W_X);

        Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X);
        Rx_RORX_3( 4); Block_4_3();
        Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5();
        Rx_RORX_3( 5); Block_4_6(W_X);
        Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X);
        Rx_RORX_3( 6); Block_4_9();
        Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11();
        Rx_RORX_3( 7); Block_4_12(W_X);

        Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X);
        Rx_RORX_3( 8); Block_8_3();
        Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5();
        Rx_RORX_3( 9); Block_8_6(W_X);
        Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X);
        Rx_RORX_3(10); Block_8_9();
        Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11();
        Rx_RORX_3(11); Block_8_12(W_X);

        Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X);
        Rx_RORX_3(12); Block_12_3();
        Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5();
        Rx_RORX_3(13); Block_12_6(W_X);
        Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X);
        Rx_RORX_3(14); Block_12_9();
        Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11();
        Rx_RORX_3(15); Block_12_12(W_X);
    }
    /* Add the working vars back into digest */

    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);
    #endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}
#endif

#if defined(HAVE_INTEL_AVX2)

#define s0_1y(dest, src)  AVX2_S(dest, src, 1);
#define s0_2y(dest, src)  AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest) ;
#define s0_3y(dest, src)  AVX2_R(G_TEMPy, src, 7); XORy(dest, G_TEMPy, dest) ;

#define s1_1y(dest, src)  AVX2_S(dest, src, 19);
#define s1_2y(dest, src)  AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest) ;
#define s1_3y(dest, src)  AVX2_R(G_TEMPy, src, 6);  XORy(dest, G_TEMPy, dest) ;

#define s0_y(dest, src)   s0_1y(dest, src) ; s0_2y(dest, src) ; s0_3y(dest, src)
#define s1_y(dest, src)   s1_1y(dest, src) ; s1_2y(dest, src) ; s1_3y(dest, src)

#define blk384(i) (W[i] = sha384->buffer[i])


#define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\
    MOVE_W_to_W_I_15(W_I_15y, w_0, w_4) ;\
    MOVE_W_to_W_I_7 (W_I_7y,  w_8, w_12) ;\
    MOVE_W_to_W_I_2 (W_I_2y,  w_12) ;\

#define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\
    s0_1y (YMM_TEMP0, W_I_15y) ;\

#define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\
    s0_2y (YMM_TEMP0, W_I_15y) ;\

#define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\
    s0_3y (YMM_TEMP0, W_I_15y) ;\

#define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\
    ADDy(W_I_TEMPy, w_0, YMM_TEMP0) ;\

#define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\
    ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y) ;\
    s1_1y (YMM_TEMP0, W_I_2y) ;\

#define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\
    s1_2y (YMM_TEMP0, W_I_2y) ;\

#define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\
    s1_3y (YMM_TEMP0, W_I_2y) ;\
    ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\

#define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\
    FEEDBACK1_to_W_I_2(W_I_2y, w_0) ;\

#define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \
    s1_1y (YMM_TEMP0, W_I_2y) ;\

#define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \
    s1_2y (YMM_TEMP0, W_I_2y) ;\

#define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\
    s1_3y (YMM_TEMP0, W_I_2y) ;\
    ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\
    MOVE_to_MEMy(w,0, w_4) ;\


static INLINE void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y) ; }
static INLINE void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y) ; }

static INLINE void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y) ; }
static INLINE void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y) ; }

static INLINE void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y) ; }
static INLINE void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y) ; }

static INLINE void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y) ; }
static INLINE void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y) ; }


static int Transform_AVX2(Sha512* sha512)
{
    const word64* K = K512;
    word64 w[4] ;
    word32 j /*, k*/;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff_Y(sha512->buffer) ;
    MOVE_to_MEMy(w,0, W_0y) ;
    for (j = 0; j < 80; j += 16) {
        Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2();
        Ry_3( 0, w[0]); Block_Y_0_3();
        Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5();
        Ry_3( 1, w[1]); Block_Y_0_6();
        Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8();
        Ry_3( 2, w[2]); Block_Y_0_9();
        Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11();
        Ry_3( 3, w[3]); Block_Y_0_12(w);

        Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2();
        Ry_3( 4, w[0]); Block_Y_4_3();
        Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5();
        Ry_3( 5, w[1]); Block_Y_4_6();
        Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8();
        Ry_3( 6, w[2]); Block_Y_4_9();
        Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11();
        Ry_3( 7, w[3]);Block_Y_4_12(w);

        Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2();
        Ry_3( 8, w[0]); Block_Y_8_3();
        Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5();
        Ry_3( 9, w[1]); Block_Y_8_6();
        Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8();
        Ry_3(10, w[2]); Block_Y_8_9();
        Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11();
        Ry_3(11, w[3]); Block_Y_8_12(w);

        Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2();
        Ry_3(12, w[0]); Block_Y_12_3();
        Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5();
        Ry_3(13, w[1]); Block_Y_12_6();
        Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8();
        Ry_3(14, w[2]); Block_Y_12_9();
        Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11();
        Ry_3(15, w[3]);Block_Y_12_12(w);
    }

    /* Add the working vars back into digest */

    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(w, 0, sizeof(w));   /* was XMEMSET(W, ...): W is not declared here */
    #endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}

#endif


#ifdef WOLFSSL_SHA384

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

#if defined(HAVE_INTEL_AVX1)
static int Transform384_AVX1(Sha384 *sha384) ;
#endif
#if defined(HAVE_INTEL_AVX2)
static int Transform384_AVX2(Sha384 *sha384) ;
#endif

#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
static int Transform384_AVX1_RORX(Sha384 *sha384) ;
#endif

static int _Transform384(Sha384 *sha384) ;
static int (*Transform384_p)(Sha384* sha384) = _Transform384 ;

#define Transform384(sha384) (*Transform384_p)(sha384)
static void set_Transform384(void) {
    if (set_cpuid_flags(CHECK_SHA384)) return ;

#if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
#elif defined(HAVE_INTEL_AVX2)
    #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
    if (IS_INTEL_AVX2 && IS_INTEL_BMI2) { Transform384_p = Transform384_AVX1_RORX ; return ; }
    #endif
    if (IS_INTEL_AVX2) { Transform384_p = Transform384_AVX2 ; return ; }
    #if defined(HAVE_INTEL_AVX1)
    Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
    #endif
#else
    Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
#endif
}

#else
    #define Transform384(sha512) _Transform384(sha512)
#endif

int wc_InitSha384(Sha384* sha384)
{
    sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
    sha384->digest[1] = W64LIT(0x629a292a367cd507);
    sha384->digest[2] = W64LIT(0x9159015a3070dd17);
    sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
    sha384->digest[4] = W64LIT(0x67332667ffc00b31);
    sha384->digest[5] = W64LIT(0x8eb44a8768581511);
    sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
    sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);

    sha384->buffLen = 0;
    sha384->loLen   = 0;
    sha384->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    set_Transform384() ;
#endif

    return 0;
}
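
/* SHA-384 note: the compression function is identical to SHA-512; only the
 * initial hash values above differ, and the final output is truncated to
 * the first 48 bytes (SHA384_DIGEST_SIZE) of the 64-byte internal state,
 * per FIPS 180-4.
 */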

static int _Transform384(Sha384* sha384)
{
    const word64* K = K512;

    word32 j;
    word64 T[8];

#ifdef WOLFSSL_SMALL_STACK
    word64* W;

    W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word64 W[16];
#endif

    /* Copy digest to working vars */
    XMEMCPY(T, sha384->digest, sizeof(T));

#ifdef USE_SLOW_SHA2
    /* less than half the code size, but 50% slower */
    /* 80 operations, not unrolled */
    for (j = 0; j < 80; j += 16) {
        int m;
        for (m = 0; m < 16; m++) { /* braces needed for macros {} */
            R2(m);
        }
    }
#else
    /* 80 operations, partially loop unrolled */
    for (j = 0; j < 80; j += 16) {
        R2( 0); R2( 1); R2( 2); R2( 3);
        R2( 4); R2( 5); R2( 6); R2( 7);
        R2( 8); R2( 9); R2(10); R2(11);
        R2(12); R2(13); R2(14); R2(15);
    }
#endif /* USE_SLOW_SHA2 */

    /* Add the working vars back into digest */

    sha384->digest[0] += a(0);
    sha384->digest[1] += b(0);
    sha384->digest[2] += c(0);
    sha384->digest[3] += d(0);
    sha384->digest[4] += e(0);
    sha384->digest[5] += f(0);
    sha384->digest[6] += g(0);
    sha384->digest[7] += h(0);

    /* Wipe variables */
    XMEMSET(W, 0, sizeof(word64) * 16);
    XMEMSET(T, 0, sizeof(T));

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}

static INLINE void AddLength384(Sha384* sha384, word32 len)
{
    word32 tmp = sha384->loLen;
    if ( (sha384->loLen += len) < tmp)
        sha384->hiLen++;                       /* carry low to high */
}

int wc_Sha384Update(Sha384* sha384, const byte* data, word32 len)
{
    /* do block size increments */
    byte* local = (byte*)sha384->buffer;

    SAVE_XMM_YMM ; /* for Intel AVX */

    while (len) {
        word32 add = min(len, SHA384_BLOCK_SIZE - sha384->buffLen);
        XMEMCPY(&local[sha384->buffLen], data, add);

        sha384->buffLen += add;
        data            += add;
        len             -= add;

        if (sha384->buffLen == SHA384_BLOCK_SIZE) {
            int ret;

            #if defined(LITTLE_ENDIAN_ORDER)
                #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
                if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
                #endif
                    ByteReverseWords64(sha384->buffer, sha384->buffer,
                                       SHA384_BLOCK_SIZE);
            #endif
            ret = Transform384(sha384);
            if (ret != 0)
                return ret;

            AddLength384(sha384, SHA384_BLOCK_SIZE);
            sha384->buffLen = 0;
        }
    }
    return 0;
}


int wc_Sha384Final(Sha384* sha384, byte* hash)
{
    byte* local = (byte*)sha384->buffer;
    int ret;

    SAVE_XMM_YMM ; /* for Intel AVX */
    AddLength384(sha384, sha384->buffLen);  /* before adding pads */

    local[sha384->buffLen++] = 0x80;  /* add 1 */

    /* pad with zeros */
    if (sha384->buffLen > SHA384_PAD_SIZE) {
        XMEMSET(&local[sha384->buffLen], 0, SHA384_BLOCK_SIZE - sha384->buffLen);
        sha384->buffLen += SHA384_BLOCK_SIZE - sha384->buffLen;

        #if defined(LITTLE_ENDIAN_ORDER)
            #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
            #endif
                ByteReverseWords64(sha384->buffer, sha384->buffer,
                                   SHA384_BLOCK_SIZE);
        #endif
        ret = Transform384(sha384);
        if (ret != 0)
            return ret;

        sha384->buffLen = 0;
    }
    XMEMSET(&local[sha384->buffLen], 0, SHA384_PAD_SIZE - sha384->buffLen);

    /* put lengths in bits */
    sha384->hiLen = (sha384->loLen >> (8*sizeof(sha384->loLen) - 3)) +
                    (sha384->hiLen << 3);
    sha384->loLen = sha384->loLen << 3;

    /* store lengths */
    #if defined(LITTLE_ENDIAN_ORDER)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
        #endif
            ByteReverseWords64(sha384->buffer, sha384->buffer,
                               SHA384_BLOCK_SIZE);
    #endif
    /* ! length ordering dependent on digest endian type ! */
    sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2] = sha384->hiLen;
    sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 1] = sha384->loLen;
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
        ByteReverseWords64(&(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]),
                           &(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]),
                           SHA384_BLOCK_SIZE - SHA384_PAD_SIZE);
    #endif
    ret = Transform384(sha384);
    if (ret != 0)
        return ret;

    #ifdef LITTLE_ENDIAN_ORDER
        ByteReverseWords64(sha384->digest, sha384->digest, SHA384_DIGEST_SIZE);
    #endif
    XMEMCPY(hash, sha384->digest, SHA384_DIGEST_SIZE);

    return wc_InitSha384(sha384);  /* reset state */
}
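
/* Companion usage sketch for the SHA-384 API above, compiled out like the
 * SHA-512 one. The expected digest is the FIPS 180-4 test vector for
 * SHA-384("abc"); the selftest harness itself is an illustrative assumption.
 */
#if 0
#include <stdio.h>

int sha384_abc_selftest(void)
{
    Sha384 sha;
    byte   digest[SHA384_DIGEST_SIZE];
    int    i, ret;

    ret = wc_InitSha384(&sha);
    if (ret == 0)
        ret = wc_Sha384Update(&sha, (const byte*)"abc", 3);
    if (ret == 0)
        ret = wc_Sha384Final(&sha, digest);   /* writes 48 bytes */
    if (ret != 0)
        return ret;

    for (i = 0; i < SHA384_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");   /* expect cb00753f...34c825a7 per FIPS 180-4 */
    return 0;
}
#endif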



#if defined(HAVE_INTEL_AVX1)

static int Transform384_AVX1(Sha384* sha384)
{
    const word64* K = K512;
    word64 W_X[16+4];
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha384->digest, sizeof(T));
    W_from_buff(W_X, sha384->buffer) ;
    for (j = 0; j < 80; j += 16) {
        Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3();
        Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X);
        Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
        Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);

        Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3();
        Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X);
        Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
        Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);

        Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3();
        Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X);
        Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
        Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);

        Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3();
        Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X);
        Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
        Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);
    }

    /* Add the working vars back into digest */

    sha384->digest[0] += a(0);
    sha384->digest[1] += b(0);
    sha384->digest[2] += c(0);
    sha384->digest[3] += d(0);
    sha384->digest[4] += e(0);
    sha384->digest[5] += f(0);
    sha384->digest[6] += g(0);
    sha384->digest[7] += h(0);

    /* Wipe variables */
    #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);   /* was XMEMSET(W, ...): the array here is W_X */
    #endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}

#endif

#if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
static int Transform384_AVX1_RORX(Sha384* sha384)
{
    const word64* K = K512;
    word64 W_X[16+4];
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha384->digest, sizeof(T));

    W_from_buff(W_X, sha384->buffer) ;
    for (j = 0; j < 80; j += 16) {
        Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X);
        Rx_RORX_3( 0); Block_0_3();
        Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5();
        Rx_RORX_3( 1); Block_0_6(W_X);
        Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X);
        Rx_RORX_3( 2); Block_0_9();
        Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11();
        Rx_RORX_3( 3); Block_0_12(W_X);

        Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X);
        Rx_RORX_3( 4); Block_4_3();
        Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5();
        Rx_RORX_3( 5); Block_4_6(W_X);
        Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X);
        Rx_RORX_3( 6); Block_4_9();
        Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11();
        Rx_RORX_3( 7); Block_4_12(W_X);

        Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X);
        Rx_RORX_3( 8); Block_8_3();
        Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5();
        Rx_RORX_3( 9); Block_8_6(W_X);
        Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X);
        Rx_RORX_3(10); Block_8_9();
        Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11();
        Rx_RORX_3(11); Block_8_12(W_X);

        Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X);
        Rx_RORX_3(12); Block_12_3();
        Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5();
        Rx_RORX_3(13); Block_12_6(W_X);
        Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X);
        Rx_RORX_3(14); Block_12_9();
        Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11();
        Rx_RORX_3(15); Block_12_12(W_X);
    }

    /* Add the working vars back into digest */

    sha384->digest[0] += a(0);
    sha384->digest[1] += b(0);
    sha384->digest[2] += c(0);
    sha384->digest[3] += d(0);
    sha384->digest[4] += e(0);
    sha384->digest[5] += f(0);
    sha384->digest[6] += g(0);
    sha384->digest[7] += h(0);

    /* Wipe variables */
    #if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);   /* was XMEMSET(W, ...): the array here is W_X */
    #endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}
#endif

#if defined(HAVE_INTEL_AVX2)

static int Transform384_AVX2(Sha384* sha384)
{
    const word64* K = K512;
    word64 w[4] ;
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha384->digest, sizeof(T));

    W_from_buff_Y(sha384->buffer) ;

    MOVE_to_MEMy(w,0, W_0y) ;
    for (j = 0; j < 80; j += 16) {
        Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]);
        Block_Y_0_2(); Ry_3( 0, w[0]); Block_Y_0_3();
        Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]);
        Block_Y_0_5(); Ry_3( 1, w[1]); Block_Y_0_6();
        Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]);
        Block_Y_0_8(); Ry_3( 2, w[2]); Block_Y_0_9();
        Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]);
        Block_Y_0_11();Ry_3( 3, w[3]); Block_Y_0_12(w);

        Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]);
        Block_Y_4_2(); Ry_3( 4, w[0]); Block_Y_4_3();
        Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]);
        Block_Y_4_5(); Ry_3( 5, w[1]); Block_Y_4_6();
        Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]);
        Block_Y_4_8(); Ry_3( 6, w[2]); Block_Y_4_9();
        Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);
        Block_Y_4_11(); Ry_3( 7, w[3]);Block_Y_4_12(w);

        Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]);
        Block_Y_8_2(); Ry_3( 8, w[0]); Block_Y_8_3();
        Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]);
        Block_Y_8_5(); Ry_3( 9, w[1]); Block_Y_8_6();
        Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]);
        Block_Y_8_8(); Ry_3(10, w[2]); Block_Y_8_9();
        Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]);
        Block_Y_8_11();Ry_3(11, w[3]); Block_Y_8_12(w);

        Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]);
        Block_Y_12_2(); Ry_3(12, w[0]); Block_Y_12_3();
        Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]);
        Block_Y_12_5(); Ry_3(13, w[1]); Block_Y_12_6();
        Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]);
        Block_Y_12_8(); Ry_3(14, w[2]); Block_Y_12_9();
        Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]);
        Block_Y_12_11();Ry_3(15, w[3]); Block_Y_12_12(w);
    }

    /* Add the working vars back into digest */

    sha384->digest[0] += a(0);
    sha384->digest[1] += b(0);
    sha384->digest[2] += c(0);
    sha384->digest[3] += d(0);
    sha384->digest[4] += e(0);
    sha384->digest[5] += f(0);
    sha384->digest[6] += g(0);
    sha384->digest[7] += h(0);

    /* Wipe variables */
    XMEMSET(T, 0, sizeof(T));

    return 0;
}

#endif

#endif /* WOLFSSL_SHA384 */

#endif /* HAVE_FIPS */

#endif /* WOLFSSL_SHA512 */