source: UsbWattMeter/trunk/wolfssl-3.7.0/wolfcrypt/src/sha256.c@164

Last change on this file since 164 was 164, checked in by coas-nagasima, 8 years ago

Added the TOPPERS/ECNL sample application "USB charger power meter" (USB充電器電力計)

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
  • Property svn:mime-type set to text/x-csrc
File size: 73.3 KB
[164]1/* sha256.c
2 *
3 * Copyright (C) 2006-2015 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL. (formerly known as CyaSSL)
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
20 */
21
22/* code submitted by raphael.huck@efixo.com */
23
24#ifdef HAVE_CONFIG_H
25 #include <config.h>
26#endif
27
28#include <wolfssl/wolfcrypt/settings.h>
29#include <wolfssl/wolfcrypt/sha256.h>
30
31#if !defined(NO_SHA256)
32#ifdef HAVE_FIPS
33
34int wc_InitSha256(Sha256* sha)
35{
36 return InitSha256_fips(sha);
37}
38
39
40int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
41{
42 return Sha256Update_fips(sha, data, len);
43}
44
45
46int wc_Sha256Final(Sha256* sha, byte* out)
47{
48 return Sha256Final_fips(sha, out);
49}
50
51
52#else /* else build without fips */
53
54#if !defined(NO_SHA256) && defined(WOLFSSL_TI_HASH)
55 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
56#else
57
58#if !defined (ALIGN32)
59 #if defined (__GNUC__)
60 #define ALIGN32 __attribute__ ( (aligned (32)))
61 #elif defined(_MSC_VER)
62 /* disable align warning, we want alignment ! */
63 #pragma warning(disable: 4324)
64 #define ALIGN32 __declspec (align (32))
65 #else
66 #define ALIGN32
67 #endif
68#endif
69
70#ifdef WOLFSSL_PIC32MZ_HASH
71#define wc_InitSha256 wc_InitSha256_sw
72#define wc_Sha256Update wc_Sha256Update_sw
73#define wc_Sha256Final wc_Sha256Final_sw
74#endif
75
76#ifdef HAVE_FIPS
77 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
78 #define FIPS_NO_WRAPPERS
79#endif
80
81#if defined(USE_INTEL_SPEEDUP)
82#define HAVE_INTEL_AVX1
83#define HAVE_INTEL_AVX2
84
85#if defined(DEBUG_XMM)
 86#include <stdio.h>
87#endif
88
89#endif
90
91#if defined(HAVE_INTEL_AVX2)
92#define HAVE_INTEL_RORX
93#endif
94
95
96/*****
97Intel AVX1/AVX2 Macro Control Structure
98
99#define HAVE_INTEL_AVX1
100#define HAVE_INTEL_AVX2
101
102#define HAVE_INTEL_RORX
103
104
105int InitSha256(Sha256* sha256) {
106 Save/Recover XMM, YMM
107 ...
108}
109
110#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
111 Transform() ; Function prototype
112#else
113 Transform() { }
114 int Sha256Final() {
115 Save/Recover XMM, YMM
116 ...
117 }
118#endif
119
120#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
 121 #if defined(HAVE_INTEL_RORX)
 122 #define RND with rorx instruction
123 #else
124 #define RND
125 #endif
126#endif
127
128#if defined(HAVE_INTEL_AVX1)
129
130 #define XMM Instructions/inline asm
131
132 int Transform() {
133 Stitched Message Sched/Round
134 }
135
136#elif defined(HAVE_INTEL_AVX2)
137
138 #define YMM Instructions/inline asm
139
140 int Transform() {
 141 More granular Stitched Message Sched/Round
142 }
143
144*/
145
146
147#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
148
 149/* Each platform needs to query cpuid to see which Intel extensions (AVX1, AVX2,
 150 * BMI2, RDRAND, RDSEED) are supported. Also, set up a macro for proper linkage w/o ABI conflicts
 151 */
152
153#ifndef _MSC_VER
154 #define cpuid(reg, leaf, sub)\
155 __asm__ __volatile__ ("cpuid":\
156 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
157 "a" (leaf), "c"(sub));
158
159 #define XASM_LINK(f) asm(f)
160#else
161
162 #include <intrin.h>
 163 #define cpuid(a,b,c) __cpuidex((int*)(a),(b),(c)) /* 3-arg form so the sub-leaf is passed, matching the calls below */
164
165 #define XASM_LINK(f)
166
167#endif /* _MSC_VER */
168
169#define EAX 0
170#define EBX 1
171#define ECX 2
172#define EDX 3
173
174#define CPUID_AVX1 0x1
175#define CPUID_AVX2 0x2
176#define CPUID_RDRAND 0x4
177#define CPUID_RDSEED 0x8
178#define CPUID_BMI2 0x10 /* MULX, RORX */
179
180#define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
181#define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
182#define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
183#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
184#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
185
186static word32 cpuid_check = 0 ;
187static word32 cpuid_flags = 0 ;
188
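/* cpuid_flag() returns 1 when the requested feature bit (register 'num', bit
 * 'bit') of CPUID leaf/sub-leaf (leaf, sub) is set, and only on CPUs whose
 * vendor string is "GenuineIntel" (the "Genu"/"ineI"/"ntel" chunks checked
 * below in EBX/EDX/ECX). */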
189static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
190 int got_intel_cpu=0;
191 unsigned int reg[5];
192
193 reg[4] = '\0' ;
194 cpuid(reg, 0, 0);
195 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
196 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
197 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
198 got_intel_cpu = 1;
199 }
200 if (got_intel_cpu) {
201 cpuid(reg, leaf, sub);
202 return((reg[num]>>bit)&0x1) ;
203 }
204 return 0 ;
205}
206
207static int set_cpuid_flags(void) {
208 if(cpuid_check==0) {
 209 if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;} /* leaf 1, ECX bit 28: AVX */
 210 if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; } /* leaf 7, EBX bit 5: AVX2 */
 211 if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; } /* leaf 7, EBX bit 8: BMI2 (RORX/MULX) */
 212 if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; } /* leaf 1, ECX bit 30: RDRAND */
 213 if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; } /* leaf 7, EBX bit 18: RDSEED */
214 cpuid_check = 1 ;
215 return 0 ;
216 }
217 return 1 ;
218}
219
220
 221/* The AVX1/AVX2 Transform implementations are defined near the tail of this file (same layout as sha512.c) */
222static int Transform(Sha256* sha256);
223
224#if defined(HAVE_INTEL_AVX1)
225static int Transform_AVX1(Sha256 *sha256) ;
226#endif
227#if defined(HAVE_INTEL_AVX2)
228static int Transform_AVX2(Sha256 *sha256) ;
229static int Transform_AVX1_RORX(Sha256 *sha256) ;
230#endif
231
232static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
233
234#define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
235
236static void set_Transform(void) {
237 if(set_cpuid_flags())return ;
238
239#if defined(HAVE_INTEL_AVX2)
240 if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
241 Transform_p = Transform_AVX1_RORX; return ;
242 Transform_p = Transform_AVX2 ;
 243 /* unreachable: referenced only to avoid a "defined but not used" warning for Transform_AVX2 */
244 }
245#endif
246#if defined(HAVE_INTEL_AVX1)
247 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
248#endif
249 Transform_p = Transform ; return ;
250}
251
252#else
253 #if defined(FREESCALE_MMCAU)
254 #define XTRANSFORM(sha256, B) Transform(sha256, B)
255 #else
256 #define XTRANSFORM(sha256, B) Transform(sha256)
257 #endif
258#endif
259
 260/* Dummy asm whose clobber list tells the compiler to save/restore the XMM/YMM registers used by Transform */
261#if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
262#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
263 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
264#elif defined(HAVE_INTEL_AVX1)
265#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
266 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
267 "xmm11","xmm12","xmm13","xmm14","xmm15")
268#else
269#define SAVE_XMM_YMM
270#endif
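/* SAVE_XMM_YMM is a dummy instruction with a clobber list naming the vector
 * registers the AVX Transform variants use; marking them clobbered makes the
 * compiler save and restore them around Update/Final instead of inside each
 * Transform call. */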
271
272#ifdef WOLFSSL_PIC32MZ_HASH
273#define InitSha256 InitSha256_sw
274#define Sha256Update Sha256Update_sw
275#define Sha256Final Sha256Final_sw
276#endif
277
278#include <wolfssl/wolfcrypt/logging.h>
279#include <wolfssl/wolfcrypt/error-crypt.h>
280
281#ifdef NO_INLINE
282 #include <wolfssl/wolfcrypt/misc.h>
283#else
284 #include <wolfcrypt/src/misc.c>
285#endif
286
287#ifdef FREESCALE_MMCAU
288 #include "cau_api.h"
289#endif
290
291#ifdef min
292#define WOLFSSL_HAVE_MIN
293#endif
294#ifndef WOLFSSL_HAVE_MIN
295#define WOLFSSL_HAVE_MIN
296
297 static INLINE word32 min(word32 a, word32 b)
298 {
299 return a > b ? b : a;
300 }
301
302#endif /* WOLFSSL_HAVE_MIN */
303
304
305int wc_InitSha256(Sha256* sha256)
306{
307 int ret = 0;
308 #ifdef FREESCALE_MMCAU
309 ret = wolfSSL_CryptHwMutexLock();
310 if(ret != 0) {
311 return ret;
312 }
313 cau_sha256_initialize_output(sha256->digest);
314 wolfSSL_CryptHwMutexUnLock();
315 #else
316 sha256->digest[0] = 0x6A09E667L;
317 sha256->digest[1] = 0xBB67AE85L;
318 sha256->digest[2] = 0x3C6EF372L;
319 sha256->digest[3] = 0xA54FF53AL;
320 sha256->digest[4] = 0x510E527FL;
321 sha256->digest[5] = 0x9B05688CL;
322 sha256->digest[6] = 0x1F83D9ABL;
323 sha256->digest[7] = 0x5BE0CD19L;
324 #endif
325
326 sha256->buffLen = 0;
327 sha256->loLen = 0;
328 sha256->hiLen = 0;
329
330#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
 331 set_Transform() ; /* choose the best Transform function for this runtime environment */
332#endif
333
334 return ret;
335}
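/* The eight initial digest words set above are the SHA-256 initial hash value
 * H(0): the first 32 bits of the fractional parts of the square roots of the
 * first eight primes (FIPS 180-4, section 5.3.3). */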
336
337
338#if !defined(FREESCALE_MMCAU)
339static const ALIGN32 word32 K[64] = {
340 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
341 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
342 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
343 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
344 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
345 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
346 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
347 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
348 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
349 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
350 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
351 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
352 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
353};
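/* K[0..63] are the SHA-256 round constants: the first 32 bits of the
 * fractional parts of the cube roots of the first 64 primes
 * (FIPS 180-4, section 4.2.2). */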
354
355#endif
356
357#if defined(FREESCALE_MMCAU)
358
359static int Transform(Sha256* sha256, byte* buf)
360{
361 int ret = wolfSSL_CryptHwMutexLock();
362 if(ret == 0) {
363 cau_sha256_hash_n(buf, 1, sha256->digest);
364 wolfSSL_CryptHwMutexUnLock();
365 }
366 return ret;
367}
368
369#endif /* FREESCALE_MMCAU */
370
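/* The macros below are the FIPS 180-4 SHA-256 round functions, written to
 * minimize operations:
 *   Ch(e,f,g)  = (e AND f) XOR ((NOT e) AND g)    -- here as (g) ^ ((e) & ((f) ^ (g)))
 *   Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *   Sigma0(a)  = ROTR2(a)  ^ ROTR13(a) ^ ROTR22(a)
 *   Sigma1(e)  = ROTR6(e)  ^ ROTR11(e) ^ ROTR25(e)
 *   Gamma0(x)  = ROTR7(x)  ^ ROTR18(x) ^ SHR3(x)    (sigma0 in the standard)
 *   Gamma1(x)  = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)   (sigma1 in the standard)
 * One RND step computes t0 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i] and
 * t1 = Sigma0(a) + Maj(a,b,c), then d += t0 and h = t0 + t1. */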
371#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
372#define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
373#define R(x, n) (((x)&0xFFFFFFFFU)>>(n))
374
375#define S(x, n) rotrFixed(x, n)
376#define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
377#define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
378#define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
379#define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
380
381#define RND(a,b,c,d,e,f,g,h,i) \
382 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
383 t1 = Sigma0((a)) + Maj((a), (b), (c)); \
384 (d) += t0; \
385 (h) = t0 + t1;
386
387#if !defined(FREESCALE_MMCAU)
388static int Transform(Sha256* sha256)
389{
390 word32 S[8], t0, t1;
391 int i;
392
393#ifdef WOLFSSL_SMALL_STACK
394 word32* W;
395
396 W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
397 if (W == NULL)
398 return MEMORY_E;
399#else
400 word32 W[64];
401#endif
402
 403 /* Copy digest state to working vars */
404 for (i = 0; i < 8; i++)
405 S[i] = sha256->digest[i];
406
407 for (i = 0; i < 16; i++)
408 W[i] = sha256->buffer[i];
409
410 for (i = 16; i < 64; i++)
411 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
412
413 for (i = 0; i < 64; i += 8) {
414 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
415 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
416 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
417 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
418 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
419 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
420 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
421 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
422 }
423
424 /* Add the working vars back into digest state[] */
425 for (i = 0; i < 8; i++) {
426 sha256->digest[i] += S[i];
427 }
428
429#ifdef WOLFSSL_SMALL_STACK
430 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
431#endif
432
433 return 0;
434}
435
436#endif /* #if !defined(FREESCALE_MMCAU) */
437
438static INLINE void AddLength(Sha256* sha256, word32 len)
439{
440 word32 tmp = sha256->loLen;
441 if ( (sha256->loLen += len) < tmp)
442 sha256->hiLen++; /* carry low to high */
443}
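/* Example: with loLen = 0xFFFFFFFD and len = 8, loLen wraps to 5, which is
 * smaller than its previous value, so hiLen is incremented; together the two
 * words keep a 64-bit running byte count that Final later converts to bits. */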
444
445int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
446{
447
448 /* do block size increments */
449 byte* local = (byte*)sha256->buffer;
450
451 SAVE_XMM_YMM ; /* for Intel AVX */
452
453 while (len) {
454 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
455 XMEMCPY(&local[sha256->buffLen], data, add);
456
457 sha256->buffLen += add;
458 data += add;
459 len -= add;
460
461 if (sha256->buffLen == SHA256_BLOCK_SIZE) {
462 int ret;
463
464 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
465 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
466 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
467 #endif
468 ByteReverseWords(sha256->buffer, sha256->buffer,
469 SHA256_BLOCK_SIZE);
470 #endif
471 ret = XTRANSFORM(sha256, local);
472 if (ret != 0)
473 return ret;
474
475 AddLength(sha256, SHA256_BLOCK_SIZE);
476 sha256->buffLen = 0;
477 }
478 }
479
480 return 0;
481}
482
483int wc_Sha256Final(Sha256* sha256, byte* hash)
484{
485 byte* local = (byte*)sha256->buffer;
486 int ret;
487
488 SAVE_XMM_YMM ; /* for Intel AVX */
489
490 AddLength(sha256, sha256->buffLen); /* before adding pads */
491
492 local[sha256->buffLen++] = 0x80; /* add 1 */
493
494 /* pad with zeros */
495 if (sha256->buffLen > SHA256_PAD_SIZE) {
496 XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
497 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
498
499 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
500 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
501 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
502 #endif
503 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
504 #endif
505
506 ret = XTRANSFORM(sha256, local);
507 if (ret != 0)
508 return ret;
509
510 sha256->buffLen = 0;
511 }
512 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
513
514 /* put lengths in bits */
515 sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
516 (sha256->hiLen << 3);
517 sha256->loLen = sha256->loLen << 3;
518
519 /* store lengths */
520 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
521 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
522 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
523 #endif
524 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
525 #endif
526 /* ! length ordering dependent on digest endian type ! */
527 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
528 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
529 sizeof(word32));
530
531 #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
532 /* Kinetis requires only these bytes reversed */
533 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
534 if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
535 #endif
536 ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
537 &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
538 2 * sizeof(word32));
539 #endif
540
541 ret = XTRANSFORM(sha256, local);
542 if (ret != 0)
543 return ret;
544
545 #if defined(LITTLE_ENDIAN_ORDER)
546 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
547 #endif
548 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
549
550 return wc_InitSha256(sha256); /* reset state */
551}
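/* Illustrative one-shot use of the API above (a sketch, not part of the
 * library; hash_message is a hypothetical helper and error handling is
 * minimal):
 *
 *   #include <wolfssl/wolfcrypt/sha256.h>
 *
 *   int hash_message(const byte* msg, word32 msgSz,
 *                    byte digest[SHA256_DIGEST_SIZE])
 *   {
 *       Sha256 sha256;
 *       int ret = wc_InitSha256(&sha256);
 *       if (ret != 0)
 *           return ret;
 *       ret = wc_Sha256Update(&sha256, msg, msgSz);  // may be called repeatedly
 *       if (ret != 0)
 *           return ret;
 *       return wc_Sha256Final(&sha256, digest);      // also resets the state
 *   }
 */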
552
553
554
555
556#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
557
558#define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
559 { word32 d ;\
560 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\
561 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\
562 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\
563 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\
564 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\
565 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\
566 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\
567 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\
568}
569
570#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
571 { word32 d ; \
572 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\
573 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\
574 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\
575 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\
576 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\
577 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\
578 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\
579 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\
580}
581
582
583#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
584 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
585
586#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
587 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
588
589
590
591
592#define S_0 %r15d
593#define S_1 %r10d
594#define S_2 %r11d
595#define S_3 %r12d
596#define S_4 %r13d
597#define S_5 %r14d
598#define S_6 %ebx
599#define S_7 %r9d
600
601#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
602
603#if defined(HAVE_INTEL_RORX)
604#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
605__asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
606
607#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
608__asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
609__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
610__asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
611
612#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
613__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
614__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
615__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
616__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
617__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
618
619#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
620/*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
621__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
622__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
623__asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
624__asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\
625
626#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
627__asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
628__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\
629__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\
630
631#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
632__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
633__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
634__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\
635__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
636
637#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
638__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
639__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
640__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
641
642#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
643__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
644__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
645__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \
646__asm__ volatile("movl %r8d, "#h"\n\t");
647
648#endif
649
650#define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
651__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
652__asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
653__asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\
654
655#define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
656__asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
657__asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
658__asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\
659__asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
660
661#define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
662__asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
663__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
664__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
665__asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
666__asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
667
668#define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
669__asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
670__asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
671__asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\
672__asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
673__asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\
674__asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\
675__asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\
676
677#define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
678__asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
679__asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\
680__asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\
681
682#define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
683__asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
684__asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
685__asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\
686__asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
687
688#define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
689__asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
690__asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
691__asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
692
693#define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
694__asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
695__asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
696 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
697__asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
698 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\
699__asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
700 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
701
702#define RND_X(a,b,c,d,e,f,g,h,i) \
703 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
704 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
705 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
706 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
707 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
708 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
709 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
710 RND_STEP_8(a,b,c,d,e,f,g,h,i);
711
712#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
713#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
714#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
715#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
716#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
717#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
718#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
719#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
720
721
722#define RND_1_3(a,b,c,d,e,f,g,h,i) {\
723 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
724 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
725 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
726}
727
728#define RND_4_6(a,b,c,d,e,f,g,h,i) {\
729 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
730 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
731 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
732}
733
734#define RND_7_8(a,b,c,d,e,f,g,h,i) {\
735 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
736 RND_STEP_8(a,b,c,d,e,f,g,h,i); \
737}
738
739#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
740#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
741#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
742#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
743#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
744#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
745#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
746#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
747
748
749#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
750#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
751#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
752#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
753#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
754#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
755#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
756#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
757
758#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
759#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
760#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
761#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
762#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
763#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
764#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
765#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
766
767#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
768#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
769#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
770#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
771#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
772#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
773#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
774#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
775
776#define FOR(cnt, init, max, inc, loop) \
777 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
778#define END(cnt, init, max, inc, loop) \
779 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ;
780
781#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
782
783#if defined(HAVE_INTEL_AVX1) /* inline assembly for Intel AVX1 instructions */
784
785#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
786#define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
787#define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
788#define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
789#define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
790#define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs)
791#define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs)
792#define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
793#define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
794
795#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
796 a,b,c,d,e,f,g,h,_i)\
797 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
798 VPALIGNR (XTMP0, X3, X2, 4) ;\
799 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
800 VPADDD (XTMP0, XTMP0, X0) ;\
801 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
802 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
803 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
804 VPSRLD (XTMP2, XTMP1, 7) ;\
805 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
806 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
807 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
808 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\
809 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
810 VPSRLD (XTMP2, XTMP1,18) ;\
811 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
812\
813 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
814 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
815 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
816 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
817 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
818 VPXOR (XTMP3, XTMP3, XTMP1) ;\
819 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
820 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
821 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
822 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
823 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
824 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
825 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
826 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
827 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
828\
829 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
830 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
831 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
832 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
833 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
834 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
835 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
836 VPXOR (XTMP2, XTMP2, XTMP3) ;\
837 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
838 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
839 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
840 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
841 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
842 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
843 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
844\
845 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
846 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
847 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
848 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
849 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
850 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
851 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
852 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
853 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
854 VPXOR (XTMP2, XTMP2, XTMP3) ;\
855 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
856 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
857 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
858 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
859 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
860 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
861
862#if defined(HAVE_INTEL_RORX)
863
864#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
865 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
866 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
867 VPALIGNR (XTMP0, X3, X2, 4) ;\
868 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
869 VPADDD (XTMP0, XTMP0, X0) ;\
870 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
871 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
872 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
873 VPSRLD (XTMP2, XTMP1, 7) ;\
874 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
875 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
876 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
877 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\
878 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
879 VPSRLD (XTMP2, XTMP1,18) ;\
880 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
881\
882 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
883 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
884 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
885 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
886 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
887 VPXOR (XTMP3, XTMP3, XTMP1) ;\
888 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
889 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
890 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
891 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
892 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
893 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
894 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
895 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
896 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
897\
898 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
899 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
900 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
901 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
902 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
903 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
904 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
905 VPXOR (XTMP2, XTMP2, XTMP3) ;\
906 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
907 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
908 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
909 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
910 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
911 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
912 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
913\
914 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
915 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
916 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
917 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
918 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
919 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
920 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
921 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
922 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
923 VPXOR (XTMP2, XTMP2, XTMP3) ;\
924 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
925 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
926 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
927 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
928 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
929 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
930
931#endif
932
933
934#define W_K_from_buff\
935 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
936 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
937 :: "m"(sha256->buffer[0]):"%xmm4") ;\
938 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
939 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
940 ::"m"(sha256->buffer[4]):"%xmm5") ;\
941 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
942 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
943 ::"m"(sha256->buffer[8]):"%xmm6") ;\
944 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
945 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
946 ::"m"(sha256->buffer[12]):"%xmm7") ;\
947
948#define _SET_W_K_XFER(reg, i)\
949 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
950 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ;
951
952#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
953
954static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
955static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
956static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
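/* mBYTE_FLIP_MASK is a vpshufb control that reverses the four bytes inside
 * each 32-bit word (big-endian message words -> host order); in the SHUF
 * masks the 0xFF bytes have their top bit set, so vpshufb writes zeros to
 * those output bytes. */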
957
958
959#define _Init_Masks(mask1, mask2, mask3)\
960__asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\
961__asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\
962__asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ;
963
964#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
965 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
966
967#define X0 %xmm4
968#define X1 %xmm5
969#define X2 %xmm6
970#define X3 %xmm7
971#define X_ X0
972
973#define XTMP0 %xmm0
974#define XTMP1 %xmm1
975#define XTMP2 %xmm2
976#define XTMP3 %xmm3
977#define XTMP4 %xmm8
978#define XTMP5 %xmm9
979#define XFER %xmm10
980
981#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
982#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
983#define BYTE_FLIP_MASK %xmm13
984
985#define XMM_REGs /* Registers are saved in Sha256Update/Final */
986 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */
987
988static int Transform_AVX1(Sha256* sha256)
989{
990
991 word32 W_K[64] ; /* temp for W+K */
992
993 #if defined(DEBUG_XMM)
994 int i, j ;
995 word32 xmm[29][4*15] ;
996 #endif
997
998 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
999 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
1000
1001 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1002
1003 SET_W_K_XFER(X0, 0) ;
1004 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1005 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
1006 SET_W_K_XFER(X1, 4) ;
1007 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1008 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
1009 SET_W_K_XFER(X2, 8) ;
1010 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1011 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
1012 SET_W_K_XFER(X3, 12) ;
1013 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1014 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
1015 SET_W_K_XFER(X0, 16) ;
1016 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1017 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
1018 SET_W_K_XFER(X1, 20) ;
1019 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1020 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
1021 SET_W_K_XFER(X2, 24) ;
1022 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1023 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
1024 SET_W_K_XFER(X3, 28) ;
1025 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1026 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
1027 SET_W_K_XFER(X0, 32) ;
1028 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1029 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
1030 SET_W_K_XFER(X1, 36) ;
1031 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1032 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
1033 SET_W_K_XFER(X2, 40) ;
1034 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1035 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
1036 SET_W_K_XFER(X3, 44) ;
1037 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1038 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
1039
1040 SET_W_K_XFER(X0, 48) ;
1041 SET_W_K_XFER(X1, 52) ;
1042 SET_W_K_XFER(X2, 56) ;
1043 SET_W_K_XFER(X3, 60) ;
1044
1045 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
1046 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
1047 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
1048 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
1049
1050 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
1051 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
1052 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
1053 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
1054
1055 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
1056 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
1057 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
1058 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
1059
1060 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
1061 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
1062 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
1063 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
1064
1065 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1066
1067 #if defined(DEBUG_XMM)
1068 for(i=0; i<29; i++) {
1069 for(j=0; j<4*14; j+=4)
1070 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
1071 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
1072 printf("\n") ;
1073 }
1074
1075 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
1076 #endif
1077
1078 return 0;
1079}
1080
1081#if defined(HAVE_INTEL_RORX)
1082static int Transform_AVX1_RORX(Sha256* sha256)
1083{
1084
1085 word32 W_K[64] ; /* temp for W+K */
1086
1087 #if defined(DEBUG_XMM)
1088 int i, j ;
1089 word32 xmm[29][4*15] ;
1090 #endif
1091
1092 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
1093 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
1094
1095 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1096 SET_W_K_XFER(X0, 0) ;
1097 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1098 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
1099 SET_W_K_XFER(X1, 4) ;
1100 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1101 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
1102 SET_W_K_XFER(X2, 8) ;
1103 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1104 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
1105 SET_W_K_XFER(X3, 12) ;
1106 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1107 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
1108 SET_W_K_XFER(X0, 16) ;
1109 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1110 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
1111 SET_W_K_XFER(X1, 20) ;
1112 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1113 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
1114 SET_W_K_XFER(X2, 24) ;
1115 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1116 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
1117 SET_W_K_XFER(X3, 28) ;
1118 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1119 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
1120 SET_W_K_XFER(X0, 32) ;
1121 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1122 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
1123 SET_W_K_XFER(X1, 36) ;
1124 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1125 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
1126 SET_W_K_XFER(X2, 40) ;
1127 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1128 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
1129 SET_W_K_XFER(X3, 44) ;
1130 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1131 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
1132
1133 SET_W_K_XFER(X0, 48) ;
1134 SET_W_K_XFER(X1, 52) ;
1135 SET_W_K_XFER(X2, 56) ;
1136 SET_W_K_XFER(X3, 60) ;
1137
1138 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
1139 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
1140 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
1141 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
1142
1143 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
1144 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
1145 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
1146 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
1147
1148 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
1149 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
1150 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
1151 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
1152
1153 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
1154 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
1155 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
1156 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
1157
1158 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1159
1160 #if defined(DEBUG_XMM)
1161 for(i=0; i<29; i++) {
1162 for(j=0; j<4*14; j+=4)
1163 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
1164 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
1165 printf("\n") ;
1166 }
1167
1168 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
1169 #endif
1170
1171 return 0;
1172}
1173#endif /* HAVE_INTEL_RORX */
1174
1175#endif /* HAVE_INTEL_AVX1 */
1176
1177
1178#if defined(HAVE_INTEL_AVX2)
1179
1180#define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ;
1181#define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ;
1182#define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
1183 :: "m"(map):YMM_REGs) ;
1184#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
1185 #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
1186#define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
1187 #ymm0"\n\t":: "m"(map):YMM_REGs) ;
1188#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\
1189 #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
1190 #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
1191#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\
1192 #src", %%"#dest" ":::YMM_REGs) ;
1193#define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\
1194 #src2", %%"#dest" ":::YMM_REGs) ;
1195#define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\
1196 #src2", %%"#dest" ":::YMM_REGs) ;
1197#define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\
1198 #src2", %%"#dest" ":::YMM_REGs) ;
1199#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
1200 #dest" "::"m"(mem):YMM_REGs) ;
1201#define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
1202 #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ;
1203
1204#define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1205#define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1206#define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1207#define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1208#define _EXTRACT_XMM_4(ymm, xmm, mem)\
1209 __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\
1210 __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1211#define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1212#define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1213#define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
1214
1215#define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;
1216#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
1217
1218#define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem)
1219#define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm)
1220#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
1221#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
1222#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
1223#define XOR(dest, src1, src2) _XOR(dest, src1, src2)
1224#define OR(dest, src1, src2) _OR(dest, src1, src2)
1225#define ADD(dest, src1, src2) _ADD(dest, src1, src2)
1226#define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem)
1227#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
1228
1229#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
1230#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
1231#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
1232
1233#define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \
1234 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ;
1235#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18);
1236#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \
1237 XOR(dest, G_TEMP, dest) ;
1238
1239#define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
1240 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ;
1241#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
1242#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \
1243 XOR(dest, G_TEMP, dest) ;
1244
1245#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \
1246 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ;
1247#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \
1248 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ;
1249#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \
1250 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ;
1251
1252#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\
1253 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ;
1254
 1255#undef volatile
1256
1257#define W_I_16 ymm8
1258#define W_I_15 ymm9
1259#define W_I_7 ymm10
1260#define W_I_2 ymm11
1261#define W_I ymm12
1262#define G_TEMP ymm13
1263#define S_TEMP ymm14
1264#define YMM_TEMP0 ymm15
1265#define YMM_TEMP0x xmm15
1266#define W_I_TEMP ymm7
1267#define W_K_TEMP ymm15
1268#define W_K_TEMPx xmm15
1269
1270#define YMM_REGs /* Registers are saved in Sha256Update/Final */
1271 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
1272
1273
1274#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
1275 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
1276 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\
1277 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
1278 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
1279 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
1280
1281#define MOVE_7_to_15(w_i_15, w_i_7)\
1282 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
1283
1284#define MOVE_I_to_7(w_i_7, w_i)\
1285 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
1286 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
1287 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
1288
1289#define MOVE_I_to_2(w_i_2, w_i)\
1290 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
1291 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
1292
1293#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
1294 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \
1295 MOVE_7_to_15(w_i_15, w_i_7) ; \
1296 MOVE_I_to_7(w_i_7, w_i) ; \
1297 MOVE_I_to_2(w_i_2, w_i) ;\
1298
1299#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1300 { word32 d ;\
1301 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\
1302 sha256->digest[0] += d;\
1303 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\
1304 sha256->digest[1] += d;\
1305 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\
1306 sha256->digest[2] += d;\
1307 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\
1308 sha256->digest[3] += d;\
1309 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\
1310 sha256->digest[4] += d;\
1311 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\
1312 sha256->digest[5] += d;\
1313 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\
1314 sha256->digest[6] += d;\
1315 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\
1316 sha256->digest[7] += d;\
1317}
1318
1319#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1320 { word32 d[8] ;\
1321 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\
1322 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\
1323 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\
1324 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\
1325 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\
1326 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\
1327 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\
1328 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\
1329 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
1330 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\
1331 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\
1332 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\
1333 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\
1334 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\
1335 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\
1336 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\
1337 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\
1338}
1339
1340
1341#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1342 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1343
1344#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1345 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1346
1347#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1348 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1349
1350
 1351 /* Byte-swap masks; the 0x80 bytes make vpshufb zero the remaining output bytes. */
1352 static const unsigned long mBYTE_FLIP_MASK_16[] =
1353 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
1354 static const unsigned long mBYTE_FLIP_MASK_15[] =
1355 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
1356 static const unsigned long mBYTE_FLIP_MASK_7 [] =
1357 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ;
1358 static const unsigned long mBYTE_FLIP_MASK_2 [] =
1359 { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ;
1360
1361 static const unsigned long mMAPtoW_I_7[] =
1362 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ;
1363 static const unsigned long mMAP1toW_I_2[] =
1364 { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ;
1365 static const unsigned long mMAP2toW_I_2[] =
1366 { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ;
1367 static const unsigned long mMAP3toW_I_2[] =
1368 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ;
1369
1370static int Transform_AVX2(Sha256* sha256)
1371{
1372
1373 #ifdef WOLFSSL_SMALL_STACK
1374 word32* W_K;
1375 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
1376 if (W_K == NULL)
1377 return MEMORY_E;
1378 #else
1379 word32 W_K[64] ;
1380 #endif
1381
1382 MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
1383 MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ;
1384 MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ;
1385 MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ;
1386 MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ;
1387
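    /* Load the current digest state sha256->digest[0..7] into S_0..S_7. */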
1388 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1389
1390 ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;
1391 MOVE_to_MEM(W_K[0], W_K_TEMP) ;
1392
1393 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
1394 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ;
1395 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ;
1396 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ;
1397 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ;
1398 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ;
1399 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ;
1400 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ;
1401
1402 ADD_MEM(YMM_TEMP0, W_I, K[8]) ;
1403 MOVE_to_MEM(W_K[8], YMM_TEMP0) ;
1404
 1405 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
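    /* In scalar terms (FIPS 180-4): Gamma0(x) = ROTR7(x) ^ ROTR18(x) ^ (x >> 3)
       and Gamma1(x) = ROTR17(x) ^ ROTR19(x) ^ (x >> 10).  Eight W values are
       produced per group, but W[i-2] and W[i-7] partly fall inside the batch
       being built, so the FEEDBACK*_to_W_I_2 / FEEDBACK_to_W_I_7 steps
       re-inject the words completed so far before the remaining lanes are
       finished.  The RND_x_y calls appear to be thirds of one round,
       interleaved with the schedule work, and ROTATE_W slides the 16-word
       window forward by eight for the next group. */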
1406 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
1407 GAMMA0_1(W_I_TEMP, W_I_15) ;
1408 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
1409 GAMMA0_2(W_I_TEMP, W_I_15) ;
1410 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
1411 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1412 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
1413 ADD(W_I, W_I_7, W_I_TEMP);
1414 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
1415 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1416 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
1417 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1418 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
1419 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
1420 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
1421 FEEDBACK1_to_W_I_2 ;
1422 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
1423 FEEDBACK_to_W_I_7 ;
1424 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
1425 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1426 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
1427 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1428 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
1429 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1430 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
1431 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
1432 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
1433 FEEDBACK2_to_W_I_2 ;
1434 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
1435 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1436 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
1437 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1438 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
1439 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
1440 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
1441 FEEDBACK3_to_W_I_2 ;
1442 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
1443 GAMMA1(YMM_TEMP0, W_I_2) ;
1444 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
1445 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
1446 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
1447 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
1448
1449 MOVE_to_REG(YMM_TEMP0, K[16]) ;
1450 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
1451 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1452 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
1453 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1454 MOVE_to_MEM(W_K[16], YMM_TEMP0) ;
1455
 1456 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1457 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
1458 GAMMA0_1(W_I_TEMP, W_I_15) ;
1459 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
1460 GAMMA0_2(W_I_TEMP, W_I_15) ;
1461 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
1462 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1463 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
1464 ADD(W_I, W_I_7, W_I_TEMP);
1465 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
1466 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1467 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
1468 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1469 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
 1470 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[24..25] are completed */
1471 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
1472 FEEDBACK1_to_W_I_2 ;
1473 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
1474 FEEDBACK_to_W_I_7 ;
1475 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
1476 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1477 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
 1478 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1479 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
1480 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1481 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
 1482 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[24..27] are completed */
1483 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
1484 FEEDBACK2_to_W_I_2 ;
1485 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
1486 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1487 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
1488 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1489 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
 1490 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..29] are completed */
1491 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
1492 FEEDBACK3_to_W_I_2 ;
1493 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
1494 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1495 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
1496 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1497 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
 1498 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..31] are completed */
1499 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
1500
1501 MOVE_to_REG(YMM_TEMP0, K[24]) ;
1502 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
1503 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1504 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
1505 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1506 MOVE_to_MEM(W_K[24], YMM_TEMP0) ;
1507
 1508 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1509 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
1510 GAMMA0_1(W_I_TEMP, W_I_15) ;
1511 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
1512 GAMMA0_2(W_I_TEMP, W_I_15) ;
1513 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
1514 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1515 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
1516 ADD(W_I, W_I_7, W_I_TEMP);
1517 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
1518 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1519 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
1520 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1521 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
 1522 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[32..33] are completed */
1523 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
1524 FEEDBACK1_to_W_I_2 ;
1525 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
1526 FEEDBACK_to_W_I_7 ;
1527 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
1528 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1529 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
1530 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1531 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
1532 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1533 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
 1534 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[32..35] are completed */
1535 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
1536 FEEDBACK2_to_W_I_2 ;
1537 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
1538 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1539 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
1540 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1541 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
 1542 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..37] are completed */
1543 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
1544 FEEDBACK3_to_W_I_2 ;
1545 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
1546 GAMMA1(YMM_TEMP0, W_I_2) ;
1547 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
1548 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
 1549 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..39] are completed */
1550 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
1551
1552 MOVE_to_REG(YMM_TEMP0, K[32]) ;
1553 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
1554 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1555 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
1556 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1557 MOVE_to_MEM(W_K[32], YMM_TEMP0) ;
1558
1559
 1560 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1561 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
1562 GAMMA0_1(W_I_TEMP, W_I_15) ;
1563 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
1564 GAMMA0_2(W_I_TEMP, W_I_15) ;
1565 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
1566 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1567 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
1568 ADD(W_I, W_I_7, W_I_TEMP);
1569 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
1570 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1571 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
1572 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1573 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
 1574 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[40..41] are completed */
1575 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
1576 FEEDBACK1_to_W_I_2 ;
1577 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
1578 FEEDBACK_to_W_I_7 ;
1579 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
1580 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1581 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
1582 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1583 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
1584 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1585 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
 1586 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[40..43] are completed */
1587 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
1588 FEEDBACK2_to_W_I_2 ;
1589 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
1590 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1591 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
1592 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1593 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
 1594 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..45] are completed */
1595 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
1596 FEEDBACK3_to_W_I_2 ;
1597 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
1598 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1599 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
1600 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1601 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
 1602 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..47] are completed */
1603 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
1604
1605 MOVE_to_REG(YMM_TEMP0, K[40]) ;
1606 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
1607 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1608 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
1609 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1610 MOVE_to_MEM(W_K[40], YMM_TEMP0) ;
1611
 1612 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1613 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
1614 GAMMA0_1(W_I_TEMP, W_I_15) ;
1615 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
1616 GAMMA0_2(W_I_TEMP, W_I_15) ;
1617 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
1618 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1619 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
1620 ADD(W_I, W_I_7, W_I_TEMP);
1621 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
1622 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1623 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
1624 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1625 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
 1626 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[48..49] are completed */
1627 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
1628 FEEDBACK1_to_W_I_2 ;
1629 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
1630 FEEDBACK_to_W_I_7 ;
1631 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
1632 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1633 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
1634 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1635 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
1636 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1637 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
 1638 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[48..51] are completed */
1639 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
1640 FEEDBACK2_to_W_I_2 ;
1641 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
1642 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1643 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
1644 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1645 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
 1646 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..53] are completed */
1647 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
1648 FEEDBACK3_to_W_I_2 ;
1649 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
1650 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1651 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
1652 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1653 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
 1654 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..55] are completed */
1655 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
1656
1657 MOVE_to_REG(YMM_TEMP0, K[48]) ;
1658 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
1659 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1660 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
1661 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1662 MOVE_to_MEM(W_K[48], YMM_TEMP0) ;
1663
 1664 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1665 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
1666 GAMMA0_1(W_I_TEMP, W_I_15) ;
1667 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
1668 GAMMA0_2(W_I_TEMP, W_I_15) ;
1669 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
1670 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
1671 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
1672 ADD(W_I, W_I_7, W_I_TEMP);
1673 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
1674 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1675 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
1676 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1677 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
 1678 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[56..57] are completed */
1679 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
1680 FEEDBACK1_to_W_I_2 ;
1681 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
1682 FEEDBACK_to_W_I_7 ;
1683 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
1684 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
1685 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
1686 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1687 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
1688 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1689 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
 1690 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[56..59] are completed */
1691 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
1692 FEEDBACK2_to_W_I_2 ;
1693 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
1694 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1695 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
1696 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1697 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
 1698 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..61] are completed */
1699 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
1700 FEEDBACK3_to_W_I_2 ;
1701 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
1702 GAMMA1_1(YMM_TEMP0, W_I_2) ;
1703 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
1704 GAMMA1_2(YMM_TEMP0, W_I_2) ;
1705 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
 1706 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..63] are completed */
1707 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
1708
1709 MOVE_to_REG(YMM_TEMP0, K[56]) ;
1710 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
1711 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
1712 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
1713 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
1714 MOVE_to_MEM(W_K[56], YMM_TEMP0) ;
1715
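    /* Rounds 56..63: the last batch of W+K values is already in W_K[56..63],
       so no further message-schedule expansion is needed. */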
1716 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
1717 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
1718 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
1719 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
1720
1721 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
1722 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
1723 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
1724 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
1725
1726 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
1727
1728 #ifdef WOLFSSL_SMALL_STACK
 1729 XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
1730 #endif
1731
1732 return 0;
1733}
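
/*
 * Illustrative only: a plain-C sketch of the message-schedule recurrence that
 * the vector code above evaluates eight words at a time (FIPS 180-4, section
 * 6.2.2).  The guard macro WOLFSSL_SHA256_SCHEDULE_SKETCH is hypothetical and
 * is not defined anywhere in the build, so this block is normally compiled
 * out; it documents the intended math rather than the shipped implementation.
 */
#ifdef WOLFSSL_SHA256_SCHEDULE_SKETCH
static word32 Sketch_RotR(word32 x, unsigned int n)
{
    /* 32-bit rotate right; n is always 1..31 here */
    return (x >> n) | (x << (32U - n));
}

/* Expand W[16..63] in place from the sixteen byte-swapped message words
   already stored in W[0..15]. */
static void Sketch_ExpandW(word32 W[64])
{
    int i;
    for (i = 16; i < 64; i++) {
        word32 g0 = Sketch_RotR(W[i-15], 7) ^ Sketch_RotR(W[i-15], 18)
                  ^ (W[i-15] >> 3);                      /* Gamma0(W[i-15]) */
        word32 g1 = Sketch_RotR(W[i-2], 17) ^ Sketch_RotR(W[i-2], 19)
                  ^ (W[i-2] >> 10);                      /* Gamma1(W[i-2])  */
        W[i] = g1 + W[i-7] + g0 + W[i-16];
    }
}
#endif /* WOLFSSL_SHA256_SCHEDULE_SKETCH */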
1734
1735#endif /* HAVE_INTEL_AVX2 */
1736
1737#endif /* HAVE_FIPS */
1738
1739#endif /* WOLFSSL_TI_HASH */
1740
1741#endif /* NO_SHA256 */
1742