source: asp3_tinet_ecnl_arm/trunk/wolfssl-3.12.2/wolfcrypt/src/sha256.c@ 352

Last change on this file since 352 was 352, checked in by coas-nagasima, 6 years ago

Added the ASP3 version of ECNL for ARM

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 84.8 KB
1/* sha256.c
2 *
3 * Copyright (C) 2006-2017 wolfSSL Inc.
4 *
5 * This file is part of wolfSSL.
6 *
7 * wolfSSL is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * wolfSSL is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
20 */
21
22
23/* code submitted by raphael.huck@efixo.com */
24
25#ifdef HAVE_CONFIG_H
26 #include <config.h>
27#endif
28
29#include <wolfssl/wolfcrypt/settings.h>
30
31#if !defined(NO_SHA256)
32
33#include <wolfssl/wolfcrypt/sha256.h>
34#include <wolfssl/wolfcrypt/error-crypt.h>
35#include <wolfssl/wolfcrypt/cpuid.h>
36
37/* fips wrapper calls, user can call direct */
38#ifdef HAVE_FIPS
39
40 int wc_InitSha256(wc_Sha256* sha)
41 {
42 if (sha == NULL) {
43 return BAD_FUNC_ARG;
44 }
45 return InitSha256_fips(sha);
46 }
47 int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
48 {
49 (void)heap;
50 (void)devId;
51 if (sha == NULL) {
52 return BAD_FUNC_ARG;
53 }
54 return InitSha256_fips(sha);
55 }
56 int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len)
57 {
58 if (sha == NULL || (data == NULL && len > 0)) {
59 return BAD_FUNC_ARG;
60 }
61 return Sha256Update_fips(sha, data, len);
62 }
63 int wc_Sha256Final(wc_Sha256* sha, byte* out)
64 {
65 if (sha == NULL || out == NULL) {
66 return BAD_FUNC_ARG;
67 }
68 return Sha256Final_fips(sha, out);
69 }
70 void wc_Sha256Free(wc_Sha256* sha)
71 {
72 (void)sha;
73 /* Not supported in FIPS */
74 }
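    /* Illustrative caller-side sketch of the wrappers above; msg, msgSz and
     * digest are hypothetical names supplied by the caller:
     *
     *     byte      digest[WC_SHA256_DIGEST_SIZE];
     *     wc_Sha256 sha;
     *     int       ret = wc_InitSha256(&sha);
     *     if (ret == 0)
     *         ret = wc_Sha256Update(&sha, msg, msgSz);
     *     if (ret == 0)
     *         ret = wc_Sha256Final(&sha, digest);
     *     wc_Sha256Free(&sha);
     */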
75
76#else /* else build without fips */
77
78
79#if defined(WOLFSSL_TI_HASH)
80 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
81#else
82
83#include <wolfssl/wolfcrypt/logging.h>
84
85#ifdef NO_INLINE
86 #include <wolfssl/wolfcrypt/misc.h>
87#else
88 #define WOLFSSL_MISC_INCLUDED
89 #include <wolfcrypt/src/misc.c>
90#endif
91
92
93#if defined(USE_INTEL_SPEEDUP)
94 #define HAVE_INTEL_AVX1
95 #define HAVE_INTEL_AVX2
96#endif /* USE_INTEL_SPEEDUP */
97
98#if defined(HAVE_INTEL_AVX2)
99 #define HAVE_INTEL_RORX
100#endif
101
102
103static INLINE void AddLength(wc_Sha256* sha256, word32 len);
104
105#if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH)
106static int InitSha256(wc_Sha256* sha256)
107{
108 int ret = 0;
109
110 if (sha256 == NULL)
111 return BAD_FUNC_ARG;
112
113 XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
114 sha256->digest[0] = 0x6A09E667L;
115 sha256->digest[1] = 0xBB67AE85L;
116 sha256->digest[2] = 0x3C6EF372L;
117 sha256->digest[3] = 0xA54FF53AL;
118 sha256->digest[4] = 0x510E527FL;
119 sha256->digest[5] = 0x9B05688CL;
120 sha256->digest[6] = 0x1F83D9ABL;
121 sha256->digest[7] = 0x5BE0CD19L;
122
123 sha256->buffLen = 0;
124 sha256->loLen = 0;
125 sha256->hiLen = 0;
126
127 return ret;
128}
129#endif
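/* The eight initial digest words set in InitSha256() are the SHA-256 IV from
 * FIPS 180-4: the first 32 bits of the fractional parts of the square roots
 * of the first eight primes. A minimal stand-alone check, using a
 * hypothetical helper that is not part of this file (4294967296.0 is 2^32):
 *
 *     #include <math.h>
 *     static word32 iv_word(double prime)
 *     {
 *         double frac = sqrt(prime) - floor(sqrt(prime));
 *         return (word32)(frac * 4294967296.0);
 *     }
 *
 * iv_word(2.0) == 0x6A09E667 and iv_word(3.0) == 0xBB67AE85, matching the
 * first two words above.
 */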
130
131
132/* Hardware Acceleration */
133#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
134
135 /* in case intel instructions aren't available, plus we need the K[] global */
136 #define NEED_SOFT_SHA256
137
138 /*****
139 Intel AVX1/AVX2 Macro Control Structure
140
141 #define HAVE_INTEL_AVX1
142 #define HAVE_INTEL_AVX2
143
144 #define HAVE_INTEL_RORX
145
146
147 int InitSha256(wc_Sha256* sha256) {
148 Save/Recover XMM, YMM
149 ...
150 }
151
152 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
153 Transform(); Function prototype
154 #else
155 Transform() { }
156 int Sha256Final() {
157 Save/Recover XMM, YMM
158 ...
159 }
160 #endif
161
162 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
163 #if defined(HAVE_INTEL_RORX)
164 #define RND with rorx instruction
165 #else
166 #define RND
167 #endif
168 #endif
169
170 #if defined(HAVE_INTEL_AVX1)
171
172 #define XMM Instructions/inline asm
173
174 int Transform() {
175 Stitched Message Sched/Round
176 }
177
178 #elif defined(HAVE_INTEL_AVX2)
179
180 #define YMM Instructions/inline asm
181
182 int Transform() {
183 More granular Stitched Message Sched/Round
184 }
185
186 #endif
187
188 */
189
190 /* Each platform needs to query info type 1 from cpuid to see if AVX is
191 * supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
192 */
193
194 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
195 static int Transform(wc_Sha256* sha256);
196 #if defined(HAVE_INTEL_AVX1)
197 static int Transform_AVX1(wc_Sha256 *sha256);
198 #endif
199 #if defined(HAVE_INTEL_AVX2)
200 static int Transform_AVX2(wc_Sha256 *sha256);
201 static int Transform_AVX1_RORX(wc_Sha256 *sha256);
202 #endif
203 static int (*Transform_p)(wc_Sha256* sha256) /* = _Transform */;
204 static int transform_check = 0;
205 static word32 intel_flags;
206 #define XTRANSFORM(S, B) (*Transform_p)((S))
207
208 static void Sha256_SetTransform(void)
209 {
210
211 if (transform_check)
212 return;
213
214 intel_flags = cpuid_get_flags();
215
216 #if defined(HAVE_INTEL_AVX2)
217 if (IS_INTEL_AVX2(intel_flags) && IS_INTEL_BMI2(intel_flags)) {
218 if (1)
219 Transform_p = Transform_AVX1_RORX;
220 else
221 Transform_p = Transform_AVX2;
222 }
223 else
224 #endif
225 #if defined(HAVE_INTEL_AVX1)
226 if (1) {
227 Transform_p = ((IS_INTEL_AVX1(intel_flags)) ? Transform_AVX1 :
228 Transform);
229 }
230 else
231 #endif
232 Transform_p = Transform;
233
234 transform_check = 1;
235 }
236
237 /* Dummy for saving MM_REGs on behalf of Transform */
238 #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
239 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
240 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
241 #elif defined(HAVE_INTEL_AVX1)
242 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
243 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
244 "xmm11","xmm12","xmm13","xmm14","xmm15")
245 #endif
246
247 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
248 {
249 int ret = 0;
250 if (sha256 == NULL)
251 return BAD_FUNC_ARG;
252
253 sha256->heap = heap;
254
255 ret = InitSha256(sha256);
256 if (ret != 0)
257 return ret;
258
259 /* choose best Transform function under this runtime environment */
260 Sha256_SetTransform();
261
262 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
263 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
264 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
265 #else
266 (void)devId;
267 #endif /* WOLFSSL_ASYNC_CRYPT */
268
269 return ret;
270 }
271
272#elif defined(FREESCALE_LTC_SHA)
273 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
274 {
275 (void)heap;
276 (void)devId;
277
278 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);
279
280 return 0;
281 }
282
283#elif defined(FREESCALE_MMCAU_SHA)
284
285 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
286 #include "cau_api.h"
287 #else
288 #include "fsl_mmcau.h"
289 #endif
290
291 #define XTRANSFORM(S, B) Transform((S), (B))
292
293 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
294 {
295 int ret = 0;
296
297 (void)heap;
298 (void)devId;
299
300 ret = wolfSSL_CryptHwMutexLock();
301 if (ret != 0) {
302 return ret;
303 }
304 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
305 cau_sha256_initialize_output(sha256->digest);
306 #else
307 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
308 #endif
309 wolfSSL_CryptHwMutexUnLock();
310
311 sha256->buffLen = 0;
312 sha256->loLen = 0;
313 sha256->hiLen = 0;
314
315 return ret;
316 }
317
318 static int Transform(wc_Sha256* sha256, byte* buf)
319 {
320 int ret = wolfSSL_CryptHwMutexLock();
321 if (ret == 0) {
322 #ifdef FREESCALE_MMCAU_CLASSIC_SHA
323 cau_sha256_hash_n(buf, 1, sha256->digest);
324 #else
325 MMCAU_SHA256_HashN(buf, 1, sha256->digest);
326 #endif
327 wolfSSL_CryptHwMutexUnLock();
328 }
329 return ret;
330 }
331
332#elif defined(WOLFSSL_PIC32MZ_HASH)
333 #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h>
334
335#elif defined(STM32_HASH)
336
337 /*
338 * STM32F2/F4/F7 hardware SHA256 support through the HASH_* APIs from the
339 * Standard Peripheral Library or CubeMX (See note in README).
340 */
341
342 /* STM32 register size, bytes */
343 #ifdef WOLFSSL_STM32_CUBEMX
344 #define SHA256_REG_SIZE SHA256_BLOCK_SIZE
345 #else
346 #define SHA256_REG_SIZE 4
347 /* STM32 struct notes:
348 * sha256->buffer = first 4 bytes used to hold partial block if needed
349 * sha256->buffLen = num bytes currently stored in sha256->buffer
350 * sha256->loLen = num bytes that have been written to STM32 FIFO
351 */
352 #endif
353 #define SHA256_HW_TIMEOUT 0xFF
354
355 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
356 {
357 if (sha256 == NULL)
358 return BAD_FUNC_ARG;
359
360 sha256->heap = heap;
361 XMEMSET(sha256->buffer, 0, sizeof(sha256->buffer));
362 sha256->buffLen = 0;
363 sha256->loLen = 0;
364 sha256->hiLen = 0;
365
366 /* initialize HASH peripheral */
367 #ifdef WOLFSSL_STM32_CUBEMX
368 HAL_HASH_DeInit(&sha256->hashHandle);
369 sha256->hashHandle.Init.DataType = HASH_DATATYPE_8B;
370 if (HAL_HASH_Init(&sha256->hashHandle) != HAL_OK) {
371 return ASYNC_INIT_E;
372 }
373 /* reset the hash control register */
374 /* required because Cube MX is not clearing algo bits */
375 HASH->CR &= ~HASH_CR_ALGO;
376 #else
377 HASH_DeInit();
378
379 /* reset the hash control register */
380 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE);
381
382 /* configure algo used, algo mode, datatype */
383 HASH->CR |= (HASH_AlgoSelection_SHA256 | HASH_AlgoMode_HASH
384 | HASH_DataType_8b);
385
386 /* reset HASH processor */
387 HASH->CR |= HASH_CR_INIT;
388 #endif
389
390 return 0;
391 }
392
393 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
394 {
395 int ret = 0;
396 byte* local;
397
398 if (sha256 == NULL || (data == NULL && len > 0)) {
399 return BAD_FUNC_ARG;
400 }
401
402 /* do block size increments */
403 local = (byte*)sha256->buffer;
404
405 /* check that internal buffLen is valid */
406 if (sha256->buffLen >= SHA256_REG_SIZE)
407 return BUFFER_E;
408
409 while (len) {
410 word32 add = min(len, SHA256_REG_SIZE - sha256->buffLen);
411 XMEMCPY(&local[sha256->buffLen], data, add);
412
413 sha256->buffLen += add;
414 data += add;
415 len -= add;
416
417 if (sha256->buffLen == SHA256_REG_SIZE) {
418 #ifdef WOLFSSL_STM32_CUBEMX
419 if (HAL_HASHEx_SHA256_Accumulate(
420 &sha256->hashHandle, local, SHA256_REG_SIZE) != HAL_OK) {
421 ret = ASYNC_OP_E;
422 }
423 #else
424 HASH_DataIn(*(uint32_t*)local);
425 #endif
426
427 AddLength(sha256, SHA256_REG_SIZE);
428 sha256->buffLen = 0;
429 }
430 }
431 return ret;
432 }
433
434 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
435 {
436 int ret = 0;
437
438 if (sha256 == NULL || hash == NULL)
439 return BAD_FUNC_ARG;
440
441 #ifdef WOLFSSL_STM32_CUBEMX
442 if (HAL_HASHEx_SHA256_Start(&sha256->hashHandle,
443 (byte*)sha256->buffer, sha256->buffLen,
444 (byte*)sha256->digest, SHA256_HW_TIMEOUT) != HAL_OK) {
445 ret = ASYNC_OP_E;
446 }
447 #else
448 __IO uint16_t nbvalidbitsdata = 0;
449
450 /* finish reading any trailing bytes into FIFO */
451 if (sha256->buffLen > 0) {
452 HASH_DataIn(*(uint32_t*)sha256->buffer);
453 AddLength(sha256, sha256->buffLen);
454 }
455
456 /* calculate number of valid bits in last word of input data */
457 nbvalidbitsdata = 8 * (sha256->loLen % SHA256_REG_SIZE);
458
459 /* configure number of valid bits in last word of the data */
460 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata);
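        /* worked example: for a 7-byte message one full 4-byte word is pushed
         * during Update and the remaining 3 bytes are pushed just above, so
         * loLen == 7 and nbvalidbitsdata == 8 * (7 % 4) == 24 */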
461
462 /* start HASH processor */
463 HASH_StartDigest();
464
465 /* wait until Busy flag == RESET */
466 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {}
467
468 /* read message digest */
469 sha256->digest[0] = HASH->HR[0];
470 sha256->digest[1] = HASH->HR[1];
471 sha256->digest[2] = HASH->HR[2];
472 sha256->digest[3] = HASH->HR[3];
473 sha256->digest[4] = HASH->HR[4];
474 sha256->digest[5] = HASH_DIGEST->HR[5];
475 sha256->digest[6] = HASH_DIGEST->HR[6];
476 sha256->digest[7] = HASH_DIGEST->HR[7];
477
478 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
479 #endif /* WOLFSSL_STM32_CUBEMX */
480
481 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
482
483 (void)wc_InitSha256_ex(sha256, sha256->heap, INVALID_DEVID);
484
485 return ret;
486 }
487
488#else
489 #define NEED_SOFT_SHA256
490
491 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
492 {
493 int ret = 0;
494 if (sha256 == NULL)
495 return BAD_FUNC_ARG;
496
497 sha256->heap = heap;
498
499 ret = InitSha256(sha256);
500 if (ret != 0)
501 return ret;
502
503 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
504 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
505 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
506 #else
507 (void)devId;
508 #endif /* WOLFSSL_ASYNC_CRYPT */
509
510 return ret;
511 }
512#endif /* End Hardware Acceleration */
513
514#ifndef SAVE_XMM_YMM
515 #define SAVE_XMM_YMM
516#endif
517
518#ifdef NEED_SOFT_SHA256
519
520 static const ALIGN32 word32 K[64] = {
521 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
522 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
523 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
524 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
525 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
526 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
527 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
528 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
529 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
530 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
531 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
532 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
533 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
534 };
535
536 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
537 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
538 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
539
540 #define S(x, n) rotrFixed(x, n)
541 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
542 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
543 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
544 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
545
546 #define RND(a,b,c,d,e,f,g,h,i) \
547 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
548 t1 = Sigma0((a)) + Maj((a), (b), (c)); \
549 (d) += t0; \
550 (h) = t0 + t1;
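    /* In FIPS 180-4 terms t0 is T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
     * and t1 is T2 = Sigma0(a) + Maj(a,b,c); each RND invocation performs one
     * round, the caller rotating the argument order instead of shifting the
     * working variables. */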
551
552 #ifndef XTRANSFORM
553 #define XTRANSFORM(S, B) Transform((S))
554 #endif
555
556 static int Transform(wc_Sha256* sha256)
557 {
558 word32 S[8], t0, t1;
559 int i;
560
561 #ifdef WOLFSSL_SMALL_STACK
562 word32* W;
563
564 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
565 DYNAMIC_TYPE_TMP_BUFFER);
566 if (W == NULL)
567 return MEMORY_E;
568 #else
569 word32 W[WC_SHA256_BLOCK_SIZE];
570 #endif
571
572 /* Copy context->state[] to working vars */
573 for (i = 0; i < 8; i++)
574 S[i] = sha256->digest[i];
575
576 for (i = 0; i < 16; i++)
577 W[i] = sha256->buffer[i];
578
579 for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
580 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
581
582 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
583 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
584 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
585 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
586 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
587 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
588 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
589 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
590 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
591 }
592
593 /* Add the working vars back into digest state[] */
594 for (i = 0; i < 8; i++) {
595 sha256->digest[i] += S[i];
596 }
597
598 #ifdef WOLFSSL_SMALL_STACK
599 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
600 #endif
601
602 return 0;
603 }
604#endif
605/* End wc_ software implementation */
606
607
608#if defined(XTRANSFORM) || defined(STM32_HASH)
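/* loLen and hiLen together hold the running message length in bytes as one
 * 64-bit value split across two 32-bit words; for example, with
 * loLen == 0xFFFFFFFD, AddLength(sha256, 8) wraps loLen to 5 and increments
 * hiLen. */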
609static INLINE void AddLength(wc_Sha256* sha256, word32 len)
610{
611 word32 tmp = sha256->loLen;
612 if ( (sha256->loLen += len) < tmp)
613 sha256->hiLen++; /* carry low to high */
614}
615#endif
616
617
618#ifdef XTRANSFORM
619
620 static INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
621 {
622 int ret = 0;
623 byte* local;
624
625 if (sha256 == NULL || (data == NULL && len > 0)) {
626 return BAD_FUNC_ARG;
627 }
628
629 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
630 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
631 #if defined(HAVE_INTEL_QA)
632 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
633 #endif
634 }
635 #endif /* WOLFSSL_ASYNC_CRYPT */
636
637 /* do block size increments */
638 local = (byte*)sha256->buffer;
639
640 /* check that internal buffLen is valid */
641 if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE)
642 return BUFFER_E;
643
644 SAVE_XMM_YMM; /* for Intel AVX */
645
646 while (len) {
647 word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
648 XMEMCPY(&local[sha256->buffLen], data, add);
649
650 sha256->buffLen += add;
651 data += add;
652 len -= add;
653
654 if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
655 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
656 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
657 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
658 #endif
659 {
660 ByteReverseWords(sha256->buffer, sha256->buffer,
661 WC_SHA256_BLOCK_SIZE);
662 }
663 #endif
664 ret = XTRANSFORM(sha256, local);
665 if (ret != 0) {
666 break;
667 }
668
669 AddLength(sha256, WC_SHA256_BLOCK_SIZE);
670 sha256->buffLen = 0;
671 }
672 }
673
674 return ret;
675 }
676
677 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
678 {
679 return Sha256Update(sha256, data, len);
680 }
681
682 static INLINE int Sha256Final(wc_Sha256* sha256)
683 {
684
685 int ret;
686 byte* local = (byte*)sha256->buffer;
687
688 if (sha256 == NULL) {
689 return BAD_FUNC_ARG;
690 }
691
692 SAVE_XMM_YMM; /* for Intel AVX */
693
694 AddLength(sha256, sha256->buffLen); /* before adding pads */
695 local[sha256->buffLen++] = 0x80; /* add 1 */
696
697 /* pad with zeros */
698 if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
699 XMEMSET(&local[sha256->buffLen], 0,
700 WC_SHA256_BLOCK_SIZE - sha256->buffLen);
701 sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
702
703 {
704 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
705 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
706 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
707 #endif
708 {
709 ByteReverseWords(sha256->buffer, sha256->buffer,
710 WC_SHA256_BLOCK_SIZE);
711 }
712 #endif
713 }
714
715 ret = XTRANSFORM(sha256, local);
716 if (ret != 0)
717 return ret;
718
719 sha256->buffLen = 0;
720 }
721 XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);
722
723 /* put lengths in bits */
724 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
725 (sha256->hiLen << 3);
726 sha256->loLen = sha256->loLen << 3;
727
728 /* store lengths */
729 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
730 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
731 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
732 #endif
733 {
734 ByteReverseWords(sha256->buffer, sha256->buffer,
735 WC_SHA256_BLOCK_SIZE);
736 }
737 #endif
738 /* ! length ordering dependent on digest endian type ! */
739 XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
740 XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
741 sizeof(word32));
742
743 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
744 defined(HAVE_INTEL_AVX2)
745 /* Kinetis requires only these bytes reversed */
746 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
747 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
748 #endif
749 {
750 ByteReverseWords(
751 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
752 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
753 2 * sizeof(word32));
754 }
755 #endif
756
757 return XTRANSFORM(sha256, local);
758 }
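    /* Shape of the final block built above when the 0x80 marker still leaves
     * room for the length words (buffLen <= WC_SHA256_PAD_SIZE after it is
     * appended): remaining message bytes, the 0x80 marker, zero padding up to
     * WC_SHA256_PAD_SIZE, then hiLen and loLen, i.e. the 64-bit message length
     * in bits with the high word first. Otherwise an extra padding-only block
     * is transformed first. */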
759
760 int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
761 {
762 int ret;
763
764 if (sha256 == NULL || hash == NULL) {
765 return BAD_FUNC_ARG;
766 }
767
768 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
769 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
770 #if defined(HAVE_INTEL_QA)
771 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
772 WC_SHA256_DIGEST_SIZE);
773 #endif
774 }
775 #endif /* WOLFSSL_ASYNC_CRYPT */
776
777 ret = Sha256Final(sha256);
778 if (ret != 0)
779 return ret;
780
781 #if defined(LITTLE_ENDIAN_ORDER)
782 ByteReverseWords(sha256->digest, sha256->digest, WC_SHA256_DIGEST_SIZE);
783 #endif
784 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
785
786 return InitSha256(sha256); /* reset state */
787 }
788
789#endif /* XTRANSFORM */
790
791
792#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
793
794#define _DigestToReg(S0, S1, S2, S3, S4, S5, S6, S7) \
795 "leaq %[digest], %%r8\n\t" \
796 "movl (%%r8), %"#S0"\n\t" \
797 "movl 4(%%r8), %"#S1"\n\t" \
798 "movl 8(%%r8), %"#S2"\n\t" \
799 "movl 12(%%r8), %"#S3"\n\t" \
800 "movl 16(%%r8), %"#S4"\n\t" \
801 "movl 20(%%r8), %"#S5"\n\t" \
802 "movl 24(%%r8), %"#S6"\n\t" \
803 "movl 28(%%r8), %"#S7"\n\t"
804
805#define _RegToDigest(S0, S1, S2, S3, S4, S5, S6, S7) \
806 "leaq %[digest], %%r8\n\t" \
807 "addl %"#S0", (%%r8)\n\t" \
808 "addl %"#S1", 4(%%r8)\n\t" \
809 "addl %"#S2", 8(%%r8)\n\t" \
810 "addl %"#S3", 12(%%r8)\n\t" \
811 "addl %"#S4", 16(%%r8)\n\t" \
812 "addl %"#S5", 20(%%r8)\n\t" \
813 "addl %"#S6", 24(%%r8)\n\t" \
814 "addl %"#S7", 28(%%r8)\n\t"
815
816#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
817 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
818
819#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
820 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
821
822
823#define S_0 %r15d
824#define S_1 %r10d
825#define S_2 %r11d
826#define S_3 %r12d
827#define S_4 %r13d
828#define S_5 %r14d
829#define S_6 %ebx
830#define S_7 %r9d
831
832#define SSE_REGs "%edi", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
833
834#if defined(HAVE_INTEL_RORX)
835#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i) \
836 "# edx = e>>>6\n\t" \
837 "rorx $6, %"#e", %%edx\n\t"
838
839#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i) \
840 "# edi = e>>>11\n\t" \
841 "rorx $11, %"#e",%%edi\n\t" \
842 "# edi = (e>>11) ^ (e>>6)\n\t" \
843 "xorl %%edx, %%edi\n\t" \
844 "# edx = e>>>25\n\t" \
845 "rorx $25, %"#e", %%edx\n\t"
846
847#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i) \
848 "# esi = f\n\t" \
849 "movl %"#f", %%esi\n\t" \
850 "# esi = f ^ g\n\t" \
851 "xorl %"#g", %%esi\n\t" \
852 "# edx = Sigma1(e)\n\t" \
853 "xorl %%edi, %%edx\n\t" \
854 "# esi = (f ^ g) & e\n\t" \
855 "andl %"#e", %%esi\n\t" \
856 "# esi = Ch(e,f,g)\n\t" \
857 "xorl %"#g", %%esi\n\t"
858
859#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i) \
860 "# h += w_k\n\t" \
861 "leaq %[W_K], %%r8\n\t" \
862 "addl ("#i")*4(%%r8), %"#h"\n\t" \
863 "# h = h + w_k + Sigma1(e)\n\t" \
864 "addl %%edx, %"#h"\n\t" \
865 "# r8d = a>>>2\n\t" \
866 "rorx $2, %"#a", %%r8d\n\t" \
867 "# edi = a>>>13\n\t" \
868 "rorx $13, %"#a", %%edi\n\t"
869
870#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i) \
871 "# edx = a>>22\n\t" \
872 "rorx $22, %"#a", %%edx\n\t" \
873 "# edi = (a>>>2) ^ (a>>>13)\n\t" \
874 "xorl %%r8d, %%edi\n\t" \
875 "# edx = Sigma0(a)\n\t" \
876 "xorl %%edi, %%edx\n\t"
877
878#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i) \
879 "# edi = b\n\t" \
880 "movl %"#b", %%edi\n\t" \
881 "# edi = a | b\n\t" \
882 "orl %"#a", %%edi\n\t" \
883 "# edi = (a | b) & c\n\t" \
884 "andl %"#c", %%edi\n\t" \
885 "# r8d = b\n\t" \
886 "movl %"#b", %%r8d\n\t"
887
888#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i) \
889 "# h += Ch(e,f,g)\n\t" \
890 "addl %%esi, %"#h"\n\t" \
891 "# r8d = b & a\n\t" \
892 "andl %"#a", %%r8d\n\t" \
893 "# r8d = Maj(a,b,c)\n\t" \
894 "orl %%edi, %%r8d\n\t"
895
896#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i) \
897 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \
898 "addl %"#h", %"#d"\n\t" \
899 "addl %"#h", %%r8d\n\t" \
900 "addl %%edx, %%r8d\n\t" \
901 "movl %%r8d, %"#h"\n\t"
902#endif /* HAVE_INTEL_RORX */
903
904#define RND_STEP_1(a,b,c,d,e,f,g,h,i) \
905 "movl %"#e", %%edx\n\t" \
906 "# edx = e>>>6\n\t" \
907 "roll $26, %%edx\n\t" \
908 "movl %"#e", %%edi\n\t"
909
910#define RND_STEP_2(a,b,c,d,e,f,g,h,i) \
911 "# edi = e>>>11\n\t" \
912 "roll $21, %%edi\n\t" \
913 "# edi = (e>>11) ^ (e>>6)\n\t" \
914 "xorl %%edx, %%edi\n\t" \
915 "# edx = e\n\t" \
916 "movl %"#e", %%edx\n\t" \
917 "# edx = e>>>25\n\t" \
918 "roll $7, %%edx\n\t"
919
920#define RND_STEP_3(a,b,c,d,e,f,g,h,i) \
921 "# esi = f\n\t" \
922 "movl %"#f", %%esi\n\t" \
923 "# esi = f ^ g\n\t" \
924 "xorl %"#g", %%esi\n\t" \
925 "# edx = Sigma1(e)\n\t" \
926 "xorl %%edi, %%edx\n\t" \
927 "# esi = (f ^ g) & e\n\t" \
928 "andl %"#e", %%esi\n\t" \
929 "# esi = Ch(e,f,g)\n\t" \
930 "xorl %"#g", %%esi\n\t"
931
932#define RND_STEP_4(a,b,c,d,e,f,g,h,i) \
933 "# h += w_k\n\t" \
934 "leaq %[W_K], %%r8\n\t" \
935 "addl ("#i")*4(%%r8), %"#h"\n\t" \
936 "# h = h + w_k + Sigma1(e)\n\t" \
937 "addl %%edx, %"#h"\n\t" \
938 "# r8d = a\n\t" \
939 "movl %"#a", %%r8d\n\t" \
940 "# r8d = a>>>2\n\t" \
941 "roll $30, %%r8d\n\t" \
942 "# edi = a\n\t" \
943 "movl %"#a", %%edi\n\t" \
944 "# edi = a>>>13\n\t" \
945 "roll $19, %%edi\n\t" \
946 "# edx = a\n\t" \
947 "movl %"#a", %%edx\n\t"
948
949#define RND_STEP_5(a,b,c,d,e,f,g,h,i) \
950 "# edx = a>>>22\n\t" \
951 "roll $10, %%edx\n\t" \
952 "# edi = (a>>>2) ^ (a>>>13)\n\t" \
953 "xorl %%r8d, %%edi\n\t" \
954 "# edx = Sigma0(a)\n\t" \
955 "xorl %%edi, %%edx\n\t"
956
957#define RND_STEP_6(a,b,c,d,e,f,g,h,i) \
958 "# edi = b\n\t" \
959 "movl %"#b", %%edi\n\t" \
960 "# edi = a | b\n\t" \
961 "orl %"#a", %%edi\n\t" \
962 "# edi = (a | b) & c\n\t" \
963 "andl %"#c", %%edi\n\t" \
964 "# r8d = b\n\t" \
965 "movl %"#b", %%r8d\n\t"
966
967#define RND_STEP_7(a,b,c,d,e,f,g,h,i) \
968 "# h += Ch(e,f,g)\n\t" \
969 "addl %%esi, %"#h"\n\t" \
970 "#r8d = b & a\n\t" \
971 "andl %"#a", %%r8d\n\t" \
972 "# r8d = Maj(a,b,c)\n\t" \
973 "orl %%edi, %%r8d\n\t"
974
975#define RND_STEP_8(a,b,c,d,e,f,g,h,i) \
976 "# d += h + w_k + Sigma1(e) + Ch(e,f,g)\n\t" \
977 "addl %"#h", %"#d"\n\t" \
978 "# r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
979 "addl %"#h", %%r8d\n\t" \
980 "# r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
981 "addl %%edx, %%r8d\n\t" \
982 "# h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)\n\t" \
983 "movl %%r8d, %"#h"\n\t"
984
985#define RND_X(a,b,c,d,e,f,g,h,i) \
986 RND_STEP_1(a,b,c,d,e,f,g,h,i) \
987 RND_STEP_2(a,b,c,d,e,f,g,h,i) \
988 RND_STEP_3(a,b,c,d,e,f,g,h,i) \
989 RND_STEP_4(a,b,c,d,e,f,g,h,i) \
990 RND_STEP_5(a,b,c,d,e,f,g,h,i) \
991 RND_STEP_6(a,b,c,d,e,f,g,h,i) \
992 RND_STEP_7(a,b,c,d,e,f,g,h,i) \
993 RND_STEP_8(a,b,c,d,e,f,g,h,i)
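/* RND_X strings RND_STEP_1..RND_STEP_8 together into one complete SHA-256
 * round using only scalar registers; the MessageSched macros below interleave
 * ("stitch") these same steps with AVX message-schedule instructions so the
 * scalar and vector work overlap. */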
994
995#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
996#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
997#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
998#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
999#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1000#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1001#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1002#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1003
1004
1005#define RND_1_3(a,b,c,d,e,f,g,h,i) \
1006 RND_STEP_1(a,b,c,d,e,f,g,h,i) \
1007 RND_STEP_2(a,b,c,d,e,f,g,h,i) \
1008 RND_STEP_3(a,b,c,d,e,f,g,h,i)
1009
1010#define RND_4_6(a,b,c,d,e,f,g,h,i) \
1011 RND_STEP_4(a,b,c,d,e,f,g,h,i) \
1012 RND_STEP_5(a,b,c,d,e,f,g,h,i) \
1013 RND_STEP_6(a,b,c,d,e,f,g,h,i)
1014
1015#define RND_7_8(a,b,c,d,e,f,g,h,i) \
1016 RND_STEP_7(a,b,c,d,e,f,g,h,i) \
1017 RND_STEP_8(a,b,c,d,e,f,g,h,i)
1018
1019#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1020#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1021#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1022#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1023#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1024#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1025#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1026#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1027
1028
1029#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1030#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1031#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1032#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1033#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1034#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1035#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1036#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1037
1038#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1039#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1040#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1041#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1042#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1043#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1044#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1045#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1046
1047#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i)
1048#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i)
1049#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i)
1050#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i)
1051#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i)
1052#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i)
1053#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i)
1054#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i)
1055
1056#define FOR(cnt, init, max, inc, loop) \
1057 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
1058#define END(cnt, init, max, inc, loop) \
1059 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::);
1060
1061#endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
1062
1063#if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */
1064
1065#define VPALIGNR(op1,op2,op3,op4) \
1066 "vpalignr $"#op4", %"#op3", %"#op2", %"#op1"\n\t"
1067#define VPADDD(op1,op2,op3) \
1068 "vpaddd %"#op3", %"#op2", %"#op1"\n\t"
1069#define VPSRLD(op1,op2,op3) \
1070 "vpsrld $"#op3", %"#op2", %"#op1"\n\t"
1071#define VPSRLQ(op1,op2,op3) \
1072 "vpsrlq $"#op3", %"#op2", %"#op1"\n\t"
1073#define VPSLLD(op1,op2,op3) \
1074 "vpslld $"#op3", %"#op2", %"#op1"\n\t"
1075#define VPOR(op1,op2,op3) \
1076 "vpor %"#op3", %"#op2", %"#op1"\n\t"
1077#define VPXOR(op1,op2,op3) \
1078 "vpxor %"#op3", %"#op2", %"#op1"\n\t"
1079#define VPSHUFD(op1,op2,op3) \
1080 "vpshufd $"#op3", %"#op2", %"#op1"\n\t"
1081#define VPSHUFB(op1,op2,op3) \
1082 "vpshufb %"#op3", %"#op2", %"#op1"\n\t"
1083
1084#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
1085 a,b,c,d,e,f,g,h,_i)\
1086 RND_STEP_1(a,b,c,d,e,f,g,h,_i)\
1087 VPALIGNR (XTMP0, X3, X2, 4)\
1088 RND_STEP_2(a,b,c,d,e,f,g,h,_i)\
1089 VPADDD (XTMP0, XTMP0, X0)\
1090 RND_STEP_3(a,b,c,d,e,f,g,h,_i)\
1091 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1092 RND_STEP_4(a,b,c,d,e,f,g,h,_i)\
1093 VPSRLD (XTMP2, XTMP1, 7)\
1094 RND_STEP_5(a,b,c,d,e,f,g,h,_i)\
1095 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
1096 RND_STEP_6(a,b,c,d,e,f,g,h,_i)\
1097 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\
1098 RND_STEP_7(a,b,c,d,e,f,g,h,_i)\
1099 VPSRLD (XTMP2, XTMP1,18)\
1100 RND_STEP_8(a,b,c,d,e,f,g,h,_i)\
1101\
1102 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1)\
1103 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\
1104 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1)\
1105 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
1106 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1)\
1107 VPXOR (XTMP3, XTMP3, XTMP1)\
1108 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1)\
1109 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
1110 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1)\
1111 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\
1112 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1)\
1113 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1114 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1)\
1115 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\
1116 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1)\
1117\
1118 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2)\
1119 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1120 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2)\
1121 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1122 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2)\
1123 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1124 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2)\
1125 VPXOR (XTMP2, XTMP2, XTMP3)\
1126 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2)\
1127 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1128 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2)\
1129 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1130 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2)\
1131 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1132 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2)\
1133\
1134 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3)\
1135 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1136 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3)\
1137 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\
1138 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3)\
1139 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1140 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3)\
1141 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
1142 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3)\
1143 VPXOR (XTMP2, XTMP2, XTMP3)\
1144 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3)\
1145 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\
1146 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3)\
1147 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1148 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3)\
1149 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\
1150
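/* One MessageSched expansion interleaves four rounds (_i .. _i+3) with the
 * computation of the next four message-schedule words, leaving its first
 * vector argument holding {W[3], W[2], W[1], W[0]} for the next group. */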
1151#if defined(HAVE_INTEL_RORX)
1152
1153#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
1154 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
1155 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i)\
1156 VPALIGNR (XTMP0, X3, X2, 4)\
1157 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i)\
1158 VPADDD (XTMP0, XTMP0, X0)\
1159 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i)\
1160 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */\
1161 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i)\
1162 VPSRLD (XTMP2, XTMP1, 7)\
1163 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i)\
1164 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
1165 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i)\
1166 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */\
1167 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i)\
1168 VPSRLD (XTMP2, XTMP1,18)\
1169 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i)\
1170\
1171 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1)\
1172 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */\
1173 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1)\
1174 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
1175 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1)\
1176 VPXOR (XTMP3, XTMP3, XTMP1)\
1177 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1)\
1178 VPXOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
1179 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1)\
1180 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */\
1181 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1)\
1182 VPSHUFD(XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/\
1183 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1)\
1184 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */\
1185 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1)\
1186\
1187 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2)\
1188 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */\
1189 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2)\
1190 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
1191 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2)\
1192 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
1193 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2)\
1194 VPXOR (XTMP2, XTMP2, XTMP3)\
1195 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2)\
1196 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */\
1197 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2)\
1198 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */\
1199 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2)\
1200 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */\
1201 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2)\
1202\
1203 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3)\
1204 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */\
1205 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3)\
1206 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */\
1207 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3)\
1208 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
1209 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3)\
1210 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
1211 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3)\
1212 VPXOR (XTMP2, XTMP2, XTMP3)\
1213 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3)\
1214 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */\
1215 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3)\
1216 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */\
1217 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3)\
1218 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */\
1219
1220#endif /* HAVE_INTEL_RORX */
1221
1222
1223#define W_K_from_buff() \
1224 "leaq %[buf], %%r8\n\t" \
1225 "vmovdqu (%%r8), %%xmm4\n\t" \
1226 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t" \
1227 "vmovdqu 16(%%r8), %%xmm5\n\t" \
1228 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t" \
1229 "vmovdqu 32(%%r8), %%xmm6\n\t" \
1230 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t" \
1231 "vmovdqu 48(%%r8), %%xmm7\n\t" \
1232 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"
1233
1234#define _SET_W_K_XFER(reg, i) \
1235 "leaq %[K], %%r8\n\t" \
1236 "vpaddd ("#i")*4(%%r8), %"#reg", %%xmm9\n\t" \
1237 "leaq %[W_K], %%r8\n\t" \
1238 "vmovdqa %%xmm9, ("#i")*4(%%r8)\n\t"
1239
1240#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
1241
1242static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
1243static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
1244static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
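/* mBYTE_FLIP_MASK is a vpshufb control that reverses the byte order within
 * each 32-bit lane, turning the little-endian message bytes loaded by
 * W_K_from_buff() into the big-endian words SHA-256 operates on. */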
1245
1246
1247#define _Init_Masks(mask1, mask2, mask3) \
1248 "vmovdqu %[FLIP], %"#mask1"\n\t" \
1249 "vmovdqu %[SHUF00BA], %"#mask2"\n\t" \
1250 "vmovdqu %[SHUFDC00], %"#mask3"\n\t"
1251
1252#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
1253 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1254
1255#define X0 %xmm4
1256#define X1 %xmm5
1257#define X2 %xmm6
1258#define X3 %xmm7
1259#define X_ X0
1260
1261#define XTMP0 %xmm0
1262#define XTMP1 %xmm1
1263#define XTMP2 %xmm2
1264#define XTMP3 %xmm3
1265#define XTMP4 %xmm8
1266#define XTMP5 %xmm9
1267#define XFER %xmm10
1268
1269#define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
1270#define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
1271#define BYTE_FLIP_MASK %xmm13
1272
1273
1274static int Transform_AVX1(wc_Sha256* sha256)
1275{
1276 ALIGN32 word32 W_K[64]; /* temp for W+K */
1277
1278 __asm__ __volatile__ (
1279
1280 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1281 "# X0, X1, X2, X3 = W[0..15]; \n\t"
1282 W_K_from_buff()
1283
1284 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1285
1286 SET_W_K_XFER(X0, 0)
1287 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1288 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1289 SET_W_K_XFER(X1, 4)
1290 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1291 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4)
1292 SET_W_K_XFER(X2, 8)
1293 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1294 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1295 SET_W_K_XFER(X3, 12)
1296 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1297 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12)
1298 SET_W_K_XFER(X0, 16)
1299 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1300 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1301 SET_W_K_XFER(X1, 20)
1302 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1303 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20)
1304 SET_W_K_XFER(X2, 24)
1305 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1306 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1307 SET_W_K_XFER(X3, 28)
1308 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1309 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28)
1310 SET_W_K_XFER(X0, 32)
1311 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1312 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1313 SET_W_K_XFER(X1, 36)
1314 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1315 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36)
1316 SET_W_K_XFER(X2, 40)
1317 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1318 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1319 SET_W_K_XFER(X3, 44)
1320 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1321 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44)
1322
1323 SET_W_K_XFER(X0, 48)
1324 SET_W_K_XFER(X1, 52)
1325 SET_W_K_XFER(X2, 56)
1326 SET_W_K_XFER(X3, 60)
1327
1328 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1329 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1330 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1331 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1332
1333 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1334 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1335 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1336 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1337
1338 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1339 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1340 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1341 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1342
1343 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1344 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1345 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1346 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1347
1348 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1349
1350 :
1351 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1352 [SHUF00BA] "m" (mSHUF_00BA[0]),
1353 [SHUFDC00] "m" (mSHUF_DC00[0]),
1354 [digest] "m" (sha256->digest),
1355 [buf] "m" (sha256->buffer),
1356 [K] "m" (K),
1357 [W_K] "m" (W_K)
1358 : SSE_REGs, "memory"
1359 );
1360
1361 return 0;
1362}
1363
1364#if defined(HAVE_INTEL_RORX)
1365static int Transform_AVX1_RORX(wc_Sha256* sha256)
1366{
1367 ALIGN32 word32 W_K[64]; /* temp for W+K */
1368
1369 __asm__ __volatile__ (
1370
1371 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
1372 "# X0, X1, X2, X3 = W[0..15]; \n\t"
1373 W_K_from_buff()
1374
1375 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1376
1377 SET_W_K_XFER(X0, 0)
1378 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
1379 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1380 SET_W_K_XFER(X1, 4)
1381 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1382 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4)
1383 SET_W_K_XFER(X2, 8)
1384 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1385 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1386 SET_W_K_XFER(X3, 12)
1387 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1388 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12)
1389 SET_W_K_XFER(X0, 16)
1390 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1391 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1392 SET_W_K_XFER(X1, 20)
1393 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1394 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20)
1395 SET_W_K_XFER(X2, 24)
1396 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1397 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1398 SET_W_K_XFER(X3, 28)
1399 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1400 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28)
1401 SET_W_K_XFER(X0, 32)
1402 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1403 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1404 SET_W_K_XFER(X1, 36)
1405 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1406 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36)
1407 SET_W_K_XFER(X2, 40)
1408 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1409 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1410 SET_W_K_XFER(X3, 44)
1411 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
1412 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44)
1413
1414 SET_W_K_XFER(X0, 48)
1415 SET_W_K_XFER(X1, 52)
1416 SET_W_K_XFER(X2, 56)
1417 SET_W_K_XFER(X3, 60)
1418
1419 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1420 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1421 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1422 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1423
1424 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1425 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1426 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1427 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1428
1429 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1430 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1431 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1432 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1433
1434 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1435 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1436 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1437 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1438
1439 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1440
1441 :
1442 : [FLIP] "m" (mBYTE_FLIP_MASK[0]),
1443 [SHUF00BA] "m" (mSHUF_00BA[0]),
1444 [SHUFDC00] "m" (mSHUF_DC00[0]),
1445 [digest] "m" (sha256->digest),
1446 [buf] "m" (sha256->buffer),
1447 [K] "m" (K),
1448 [W_K] "m" (W_K)
1449 : SSE_REGs, "memory"
1450 );
1451
1452 return 0;
1453}
1454#endif /* HAVE_INTEL_RORX */
1455#endif /* HAVE_INTEL_AVX1 */
1456
1457
1458#if defined(HAVE_INTEL_AVX2)
1459
1460#define _MOVE_to_REG(ymm, mem, i) \
1461 "leaq %["#mem"], %%r8\n\t" \
1462 "vmovdqu ("#i")*4(%%r8), %%"#ymm"\n\t"
1463#define _MOVE_to_MEM(mem, i, ymm) \
1464 "leaq %["#mem"], %%r8\n\t" \
1465 "vmovdqu %%"#ymm", "#i"*4(%%r8)\n\t"
1466#define _BYTE_SWAP(ymm, map) \
1467 "vpshufb %["#map"], %%"#ymm", %%"#ymm"\n\t"
1468#define _MOVE_128(ymm0, ymm1, ymm2, map) \
1469 "vperm2i128 $"#map", %%"#ymm2", %%"#ymm1", %%"#ymm0"\n\t"
1470#define _MOVE_BYTE(ymm0, ymm1, map) \
1471 "vpshufb %["#map"], %%"#ymm1", %%"#ymm0"\n\t"
1472#define _S_TEMP(dest, src, bits, temp) \
1473 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t" \
1474 "vpslld $32-"#bits", %%"#src", %%"#temp"\n\t" \
1475 "vpor %%"#temp",%%"#dest", %%"#dest"\n\t"
1476#define _AVX2_R(dest, src, bits) \
1477 "vpsrld $"#bits", %%"#src", %%"#dest"\n\t"
1478#define _XOR(dest, src1, src2) \
1479 "vpxor %%"#src1", %%"#src2", %%"#dest"\n\t"
1480#define _OR(dest, src1, src2) \
1481 "vpor %%"#src1", %%"#src2", %%"#dest"\n\t"
1482#define _ADD(dest, src1, src2) \
1483 "vpaddd %%"#src1", %%"#src2", %%"#dest"\n\t"
1484#define _ADD_MEM(dest, src1, mem, i) \
1485 "leaq %["#mem"], %%r8\n\t" \
1486 "vpaddd "#i"*4(%%r8), %%"#src1", %%"#dest"\n\t"
1487#define _BLEND(map, dest, src1, src2) \
1488 "vpblendd $"#map", %%"#src1", %%"#src2", %%"#dest"\n\t"
1489
1490#define _EXTRACT_XMM_0(xmm, mem) \
1491 "vpextrd $0, %%"#xmm", %["#mem"]\n\t"
1492#define _EXTRACT_XMM_1(xmm, mem) \
1493 "vpextrd $1, %%"#xmm", %["#mem"]\n\t"
1494#define _EXTRACT_XMM_2(xmm, mem) \
1495 "vpextrd $2, %%"#xmm", %["#mem"]\n\t"
1496#define _EXTRACT_XMM_3(xmm, mem) \
1497 "vpextrd $3, %%"#xmm", %["#mem"]\n\t"
1498#define _EXTRACT_XMM_4(ymm, xmm, mem) \
1499 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t" \
1500 "vpextrd $0, %%"#xmm", %["#mem"]\n\t"
1501#define _EXTRACT_XMM_5(xmm, mem) \
1502 "vpextrd $1, %%"#xmm", %["#mem"]\n\t"
1503#define _EXTRACT_XMM_6(xmm, mem) \
1504 "vpextrd $2, %%"#xmm", %["#mem"]\n\t"
1505#define _EXTRACT_XMM_7(xmm, mem) \
1506 "vpextrd $3, %%"#xmm", %["#mem"]\n\t"
1507
1508#define _SWAP_YMM_HL(ymm) \
1509 "vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm"\n\t"
1510#define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
1511
1512#define MOVE_to_REG(ymm, mem, i) _MOVE_to_REG(ymm, mem, i)
1513#define MOVE_to_MEM(mem, i, ymm) _MOVE_to_MEM(mem, i, ymm)
1514#define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
1515#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
1516#define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
1517#define XOR(dest, src1, src2) _XOR(dest, src1, src2)
1518#define OR(dest, src1, src2) _OR(dest, src1, src2)
1519#define ADD(dest, src1, src2) _ADD(dest, src1, src2)
1520#define ADD_MEM(dest, src1, mem, i) _ADD_MEM(dest, src1, mem, i)
1521#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
1522
1523#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp)
1524#define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
1525#define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
1526
1527#define GAMMA0(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18) \
1528 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) XOR(dest, G_TEMP, dest)
1529#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7) AVX2_S(G_TEMP, src, 18)
1530#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 3) \
1531 XOR(dest, G_TEMP, dest)
1532
1533#define GAMMA1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19) \
1534 XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) XOR(dest, G_TEMP, dest)
1535#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17) AVX2_S(G_TEMP, src, 19)
1536#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) AVX2_R(G_TEMP, src, 10) \
1537 XOR(dest, G_TEMP, dest)
1538
1539#define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP1W_2) \
1540 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2)
1541#define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) \
1542 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAP2W_2) BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2)
1543#define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, MAP3W_2) \
1544 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2)
1545
1546#define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08)\
1547 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, MAPW_7) BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7)
1548
1549#undef voitle
1550
1551#define W_I_16 ymm8
1552#define W_I_15 ymm9
1553#define W_I_7 ymm10
1554#define W_I_2 ymm11
1555#define W_I ymm12
1556#define G_TEMP ymm13
1557#define S_TEMP ymm14
1558#define YMM_TEMP0 ymm15
1559#define YMM_TEMP0x xmm15
1560#define W_I_TEMP ymm7
1561#define W_K_TEMP ymm15
1562#define W_K_TEMPx xmm15
1563
1564#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
1565 "vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15"\n\t" \
1566 "vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16"\n\t" \
1567 "vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15"\n\t" \
1568 "vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16"\n\t" \
1569 "vpshufd $0x93, %%"#w_i_16", %%"#w_i_16"\n\t"
1570
1571#define MOVE_7_to_15(w_i_15, w_i_7)\
1572 "vmovdqu %%"#w_i_7", %%"#w_i_15"\n\t"
1573
1574#define MOVE_I_to_7(w_i_7, w_i)\
1575 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7"\n\t" \
1576 "vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7"\n\t" \
1577 "vpshufd $0x39, %%"#w_i_7", %%"#w_i_7"\n\t"
1578
1579#define MOVE_I_to_2(w_i_2, w_i)\
1580 "vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2"\n\t" \
1581 "vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2"\n\t"
1582
1583#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
1584 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) \
1585 MOVE_7_to_15(w_i_15, w_i_7) \
1586 MOVE_I_to_7(w_i_7, w_i) \
1587 MOVE_I_to_2(w_i_2, w_i)
1588
1589#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1590 { word32 d[8];\
1591 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\
1592 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\
1593 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\
1594 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\
1595 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\
1596 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\
1597 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\
1598 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\
1599 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
1600 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\
1601 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\
1602 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\
1603 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\
1604 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\
1605 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\
1606 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\
1607 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\
1608}
1609
1610
1611#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1612 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1613
1614#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1615 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1616
1617#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
1618 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
1619
1620
1621 /* Byte swap masks to ensure that the rest of the words are filled with zeros. */
1622 static const unsigned long mBYTE_FLIP_MASK_16[] =
1623 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
1624 static const unsigned long mBYTE_FLIP_MASK_15[] =
1625 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
1626 static const unsigned long mBYTE_FLIP_MASK_7 [] =
1627 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b };
1628 static const unsigned long mBYTE_FLIP_MASK_2 [] =
1629 { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 };
1630
1631 static const unsigned long mMAPtoW_I_7[] =
1632 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 };
1633 static const unsigned long mMAP1toW_I_2[] =
1634 { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 };
1635 static const unsigned long mMAP2toW_I_2[] =
1636 { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 };
1637 static const unsigned long mMAP3toW_I_2[] =
1638 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 };
1639
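/* Illustrative only: the tables above are vpshufb control masks. Each mask
 * byte selects a source byte within its 16-byte lane, and any index with the
 * high bit set (0x80) produces zero, so a single shuffle both byte-swaps the
 * big-endian input words and zero-fills the lanes that W_I_7/W_I_2 do not use
 * yet. A minimal scalar model of that shuffle semantic follows (hypothetical
 * helper, excluded from the build). */
#if 0
static void pshufb_lane_model(unsigned char out[16],
                              const unsigned char src[16],
                              const unsigned char mask[16])
{
    int i;
    for (i = 0; i < 16; i++) {
        /* high bit set -> zero; otherwise the low nibble indexes the lane */
        out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
    }
}
#endif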
1640static int Transform_AVX2(wc_Sha256* sha256)
1641{
1642#ifdef WOLFSSL_SMALL_STACK
1643 word32* W_K;
1644 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
1645 if (W_K == NULL)
1646 return MEMORY_E;
1647#else
1648 word32 W_K[64];
1649#endif
1650
1651 __asm__ __volatile__ (
1652
1653 MOVE_to_REG(W_I_16, buf, 0) BYTE_SWAP(W_I_16, FLIP_16)
1654 MOVE_to_REG(W_I_15, buf, 1) BYTE_SWAP(W_I_15, FLIP_15)
1655 MOVE_to_REG(W_I, buf, 8) BYTE_SWAP(W_I, FLIP_16)
1656 MOVE_to_REG(W_I_7, buf, 16-7) BYTE_SWAP(W_I_7, FLIP_7)
1657 MOVE_to_REG(W_I_2, buf, 16-2) BYTE_SWAP(W_I_2, FLIP_2)
1658
1659 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1660
1661 ADD_MEM(W_K_TEMP, W_I_16, K, 0)
1662 MOVE_to_MEM(W_K, 0, W_K_TEMP)
1663
1664 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0)
1665 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1)
1666 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2)
1667 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3)
1668 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4)
1669 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5)
1670 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6)
1671 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7)
1672
1673 ADD_MEM(YMM_TEMP0, W_I, K, 8)
1674 MOVE_to_MEM(W_K, 8, YMM_TEMP0)
1675
1676 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1677 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1678 GAMMA0_1(W_I_TEMP, W_I_15)
1679 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1680 GAMMA0_2(W_I_TEMP, W_I_15)
1681 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8)
1682 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1683 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1684 ADD(W_I, W_I_7, W_I_TEMP)
1685 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1686 GAMMA1_1(YMM_TEMP0, W_I_2)
1687 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9)
1688 GAMMA1_2(YMM_TEMP0, W_I_2)
1689 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1690 ADD(W_I, W_I, YMM_TEMP0)/* now W[16..17] are completed */
1691 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1692 FEEDBACK1_to_W_I_2
1693 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10)
1694 FEEDBACK_to_W_I_7
1695 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1696 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1697 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1698 GAMMA1_1(YMM_TEMP0, W_I_2)
1699 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11)
1700 GAMMA1_2(YMM_TEMP0, W_I_2)
1701 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1702 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[16..19] are completed */
1703 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1704 FEEDBACK2_to_W_I_2
1705 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12)
1706 GAMMA1_1(YMM_TEMP0, W_I_2)
1707 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1708 GAMMA1_2(YMM_TEMP0, W_I_2)
1709 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1710 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..21] are completed */
1711 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13)
1712 FEEDBACK3_to_W_I_2
1713 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1714 GAMMA1(YMM_TEMP0, W_I_2)
1715 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1716 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14)
1717 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[16..23] are completed */
1718 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1719
1720 MOVE_to_REG(YMM_TEMP0, K, 16)
1721 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1722 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1723 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15)
1724 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1725 MOVE_to_MEM(W_K, 16, YMM_TEMP0)
1726
1727 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1728 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1729 GAMMA0_1(W_I_TEMP, W_I_15)
1730 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1731 GAMMA0_2(W_I_TEMP, W_I_15)
1732 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16)
1733 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1734 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1735 ADD(W_I, W_I_7, W_I_TEMP)
1736 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1737 GAMMA1_1(YMM_TEMP0, W_I_2)
1738 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17)
1739 GAMMA1_2(YMM_TEMP0, W_I_2)
1740 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1741 ADD(W_I, W_I, YMM_TEMP0)/* now W[24..25] are completed */
1742 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1743 FEEDBACK1_to_W_I_2
1744 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18)
1745 FEEDBACK_to_W_I_7
1746 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1747 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1748 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1749 GAMMA1(YMM_TEMP0, W_I_2)
1750 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19)
1751 GAMMA1_2(YMM_TEMP0, W_I_2)
1752 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1753 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[24..27] are completed */
1754 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1755 FEEDBACK2_to_W_I_2
1756 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20)
1757 GAMMA1_1(YMM_TEMP0, W_I_2)
1758 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1759 GAMMA1_2(YMM_TEMP0, W_I_2)
1760 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1761 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[24..29] are completed */
1762 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21)
1763 FEEDBACK3_to_W_I_2
1764 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1765 GAMMA1_1(YMM_TEMP0, W_I_2)
1766 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1767 GAMMA1_2(YMM_TEMP0, W_I_2)
1768 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22)
1769 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[24..31] are completed */
1770 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1771
1772 MOVE_to_REG(YMM_TEMP0, K, 24)
1773 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1774 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1775 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23)
1776 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1777 MOVE_to_MEM(W_K, 24, YMM_TEMP0)
1778
1779 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1780 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1781 GAMMA0_1(W_I_TEMP, W_I_15)
1782 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1783 GAMMA0_2(W_I_TEMP, W_I_15)
1784 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24)
1785 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1786 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1787 ADD(W_I, W_I_7, W_I_TEMP)
1788 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1789 GAMMA1_1(YMM_TEMP0, W_I_2)
1790 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25)
1791 GAMMA1_2(YMM_TEMP0, W_I_2)
1792 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1793 ADD(W_I, W_I, YMM_TEMP0)/* now W[32..33] are completed */
1794 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1795 FEEDBACK1_to_W_I_2
1796 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26)
1797 FEEDBACK_to_W_I_7
1798 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1799 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1800 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1801 GAMMA1_1(YMM_TEMP0, W_I_2)
1802 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27)
1803 GAMMA1_2(YMM_TEMP0, W_I_2)
1804 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1805 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[32..35] are completed */
1806 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1807 FEEDBACK2_to_W_I_2
1808 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28)
1809 GAMMA1_1(YMM_TEMP0, W_I_2)
1810 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1811 GAMMA1_2(YMM_TEMP0, W_I_2)
1812 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1813 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[32..37] are completed */
1814 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29)
1815 FEEDBACK3_to_W_I_2
1816 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1817 GAMMA1(YMM_TEMP0, W_I_2)
1818 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1819 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30)
1820 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[32..39] are completed */
1821 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1822
1823 MOVE_to_REG(YMM_TEMP0, K, 32)
1824 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1825 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1826 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31)
1827 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1828 MOVE_to_MEM(W_K, 32, YMM_TEMP0)
1829
1830
1831 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1832 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1833 GAMMA0_1(W_I_TEMP, W_I_15)
1834 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1835 GAMMA0_2(W_I_TEMP, W_I_15)
1836 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32)
1837 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1838 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1839 ADD(W_I, W_I_7, W_I_TEMP)
1840 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1841 GAMMA1_1(YMM_TEMP0, W_I_2)
1842 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33)
1843 GAMMA1_2(YMM_TEMP0, W_I_2)
1844 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1845 ADD(W_I, W_I, YMM_TEMP0)/* now W[40..41] are completed */
1846 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1847 FEEDBACK1_to_W_I_2
1848 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34)
1849 FEEDBACK_to_W_I_7
1850 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1851 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1852 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1853 GAMMA1_1(YMM_TEMP0, W_I_2)
1854 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35)
1855 GAMMA1_2(YMM_TEMP0, W_I_2)
1856 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1857 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[40..43] are completed */
1858 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1859 FEEDBACK2_to_W_I_2
1860 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36)
1861 GAMMA1_1(YMM_TEMP0, W_I_2)
1862 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1863 GAMMA1_2(YMM_TEMP0, W_I_2)
1864 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1865 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[40..45] are completed */
1866 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37)
1867 FEEDBACK3_to_W_I_2
1868 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1869 GAMMA1_1(YMM_TEMP0, W_I_2)
1870 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1871 GAMMA1_2(YMM_TEMP0, W_I_2)
1872 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38)
1873 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[40..47] are completed */
1874 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1875
1876 MOVE_to_REG(YMM_TEMP0, K, 40)
1877 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1878 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1879 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39)
1880 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1881 MOVE_to_MEM(W_K, 40, YMM_TEMP0)
1882
1883 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1884 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1885 GAMMA0_1(W_I_TEMP, W_I_15)
1886 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1887 GAMMA0_2(W_I_TEMP, W_I_15)
1888 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40)
1889 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1890 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1891 ADD(W_I, W_I_7, W_I_TEMP)
1892 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1893 GAMMA1_1(YMM_TEMP0, W_I_2)
1894 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41)
1895 GAMMA1_2(YMM_TEMP0, W_I_2)
1896 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1897 ADD(W_I, W_I, YMM_TEMP0)/* now W[48..49] are completed */
1898 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1899 FEEDBACK1_to_W_I_2
1900 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42)
1901 FEEDBACK_to_W_I_7
1902 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1903 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1904 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1905 GAMMA1_1(YMM_TEMP0, W_I_2)
1906 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43)
1907 GAMMA1_2(YMM_TEMP0, W_I_2)
1908 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1909 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[48..51] are completed */
1910 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1911 FEEDBACK2_to_W_I_2
1912 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44)
1913 GAMMA1_1(YMM_TEMP0, W_I_2)
1914 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1915 GAMMA1_2(YMM_TEMP0, W_I_2)
1916 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1917 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[48..53] are completed */
1918 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45)
1919 FEEDBACK3_to_W_I_2
1920 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1921 GAMMA1_1(YMM_TEMP0, W_I_2)
1922 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1923 GAMMA1_2(YMM_TEMP0, W_I_2)
1924 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46)
1925 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[48..55] are completed */
1926 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1927
1928 MOVE_to_REG(YMM_TEMP0, K, 48)
1929 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1930 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1931 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47)
1932 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1933 MOVE_to_MEM(W_K, 48, YMM_TEMP0)
1934
1935 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
1936 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1937 GAMMA0_1(W_I_TEMP, W_I_15)
1938 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1939 GAMMA0_2(W_I_TEMP, W_I_15)
1940 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48)
1941 ADD(W_I_TEMP, W_I_16, W_I_TEMP)/* for saving W_I before adding incomplete W_I_7 */
1942 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1943 ADD(W_I, W_I_7, W_I_TEMP)
1944 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1945 GAMMA1_1(YMM_TEMP0, W_I_2)
1946 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49)
1947 GAMMA1_2(YMM_TEMP0, W_I_2)
1948 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1949 ADD(W_I, W_I, YMM_TEMP0)/* now W[56..57] are completed */
1950 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1951 FEEDBACK1_to_W_I_2
1952 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50)
1953 FEEDBACK_to_W_I_7
1954 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1955 ADD(W_I_TEMP, W_I_7, W_I_TEMP)
1956 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1957 GAMMA1_1(YMM_TEMP0, W_I_2)
1958 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51)
1959 GAMMA1_2(YMM_TEMP0, W_I_2)
1960 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1961 ADD(W_I, W_I_TEMP, YMM_TEMP0)/* now W[56..59] are completed */
1962 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1963 FEEDBACK2_to_W_I_2
1964 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52)
1965 GAMMA1_1(YMM_TEMP0, W_I_2)
1966 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1967 GAMMA1_2(YMM_TEMP0, W_I_2)
1968 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1969 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[56..61] are completed */
1970 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53)
1971 FEEDBACK3_to_W_I_2
1972 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1973 GAMMA1_1(YMM_TEMP0, W_I_2)
1974 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1975 GAMMA1_2(YMM_TEMP0, W_I_2)
1976 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54)
1977 ADD(W_I, W_I_TEMP, YMM_TEMP0) /* now W[56..63] are completed */
1978 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1979
1980 MOVE_to_REG(YMM_TEMP0, K, 56)
1981 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1982 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I)
1983 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55)
1984 ADD(YMM_TEMP0, YMM_TEMP0, W_I)
1985 MOVE_to_MEM(W_K, 56, YMM_TEMP0)
1986
1987 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56)
1988 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57)
1989 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58)
1990 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59)
1991
1992 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60)
1993 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61)
1994 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62)
1995 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63)
1996
1997 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7)
1998
1999 :
2000 : [FLIP_16] "m" (mBYTE_FLIP_MASK_16[0]),
2001 [FLIP_15] "m" (mBYTE_FLIP_MASK_15[0]),
2002 [FLIP_7] "m" (mBYTE_FLIP_MASK_7[0]),
2003 [FLIP_2] "m" (mBYTE_FLIP_MASK_2[0]),
2004 [MAPW_7] "m" (mMAPtoW_I_7[0]),
2005 [MAP1W_2] "m" (mMAP1toW_I_2[0]),
2006 [MAP2W_2] "m" (mMAP2toW_I_2[0]),
2007 [MAP3W_2] "m" (mMAP3toW_I_2[0]),
2008 [digest] "m" (sha256->digest),
2009 [buf] "m" (sha256->buffer),
2010 [K] "m" (K),
2011 [W_K] "m" (W_K)
2012 : SSE_REGs, "memory"
2013 );
2014
2015#ifdef WOLFSSL_SMALL_STACK
2016 XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
2017#endif
2018
2019 return 0;
2020}
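/* For reference, a scalar sketch of what the interleaved schedule above
 * computes: W[16..63] per FIPS 180-4 and the per-round W[i] + K[i] values
 * that Transform_AVX2 stores into W_K eight entries at a time. The
 * ROTR32/GAMMA*_REF/ExpandWK_ref names are illustrative only and the block
 * is excluded from the build. */
#if 0
#define ROTR32(x, n)  (((x) >> (n)) | ((x) << (32 - (n))))
#define GAMMA0_REF(x) (ROTR32((x), 7)  ^ ROTR32((x), 18) ^ ((x) >> 3))
#define GAMMA1_REF(x) (ROTR32((x), 17) ^ ROTR32((x), 19) ^ ((x) >> 10))

static void ExpandWK_ref(const word32 M[16], word32 W_K_out[64])
{
    word32 W[64];
    int i;

    for (i = 0; i < 16; i++)
        W[i] = M[i];                /* input words, already byte-swapped */
    for (i = 16; i < 64; i++)
        W[i] = GAMMA1_REF(W[i-2]) + W[i-7] + GAMMA0_REF(W[i-15]) + W[i-16];
    for (i = 0; i < 64; i++)
        W_K_out[i] = W[i] + K[i];   /* K[] is the round constant table above */
}
#endif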
2021
2022#endif /* HAVE_INTEL_AVX2 */
2023
2024
2025#ifdef WOLFSSL_SHA224
2026
2027#ifdef STM32_HASH
2028
2029 #define Sha256Update Sha224Update
2030 #define Sha256Final Sha224Final
2031
2032 /*
2033 * STM32F2/F4/F7 hardware SHA224 support through the HASH_* APIs from the
2034 * Standard Peripheral Library or CubeMX (see note in README).
2035 */
2036
2037 /* STM32 register size, bytes */
2038 #ifdef WOLFSSL_STM32_CUBEMX
2039 #define SHA224_REG_SIZE WC_SHA224_BLOCK_SIZE
2040 #else
2041 #define SHA224_REG_SIZE 4
2042 /* STM32 struct notes:
2043 * sha224->buffer = first 4 bytes used to hold partial block if needed
2044 * sha224->buffLen = num bytes currently stored in sha224->buffer
2045 * sha224->loLen = num bytes that have been written to STM32 FIFO
2046 */
2047 #endif
2048 #define SHA224_HW_TIMEOUT 0xFF
2049
2050 static int InitSha224(wc_Sha224* sha224)
2051 {
2052 if (sha224 == NULL)
2053 return BAD_FUNC_ARG;
2054
2055 XMEMSET(sha224->buffer, 0, sizeof(sha224->buffer));
2056 sha224->buffLen = 0;
2057 sha224->loLen = 0;
2058 sha224->hiLen = 0;
2059
2060 /* initialize HASH peripheral */
2061 #ifdef WOLFSSL_STM32_CUBEMX
2062 HAL_HASH_DeInit(&sha224->hashHandle);
2063 sha224->hashHandle.Init.DataType = HASH_DATATYPE_8B;
2064 if (HAL_HASH_Init(&sha224->hashHandle) != HAL_OK) {
2065 return ASYNC_INIT_E;
2066 }
2067 /* required because Cube MX is not clearing algo bits */
2068 HASH->CR &= ~HASH_CR_ALGO;
2069 #else
2070 HASH_DeInit();
2071
2072 /* reset the hash control register */
2073 /* required because Cube MX is not clearing algo bits */
2074 HASH->CR &= ~ (HASH_CR_ALGO | HASH_CR_DATATYPE | HASH_CR_MODE);
2075
2076 /* configure algo used, algo mode, datatype */
2077 HASH->CR |= (HASH_AlgoSelection_SHA224 | HASH_AlgoMode_HASH
2078 | HASH_DataType_8b);
2079
2080 /* reset HASH processor */
2081 HASH->CR |= HASH_CR_INIT;
2082 #endif
2083
2084 return 0;
2085 }
2086
2087 static int Sha224Update(wc_Sha256* sha224, const byte* data, word32 len)
2088 {
2089 int ret = 0;
2090 byte* local;
2091
2092 /* do block size increments */
2093 local = (byte*)sha224->buffer;
2094
2095 /* check that internal buffLen is valid */
2096 if (sha224->buffLen >= SHA224_REG_SIZE)
2097 return BUFFER_E;
2098
2099 while (len) {
2100 word32 add = min(len, SHA224_REG_SIZE - sha224->buffLen);
2101 XMEMCPY(&local[sha224->buffLen], data, add);
2102
2103 sha224->buffLen += add;
2104 data += add;
2105 len -= add;
2106
2107 if (sha224->buffLen == SHA224_REG_SIZE) {
2108 #ifdef WOLFSSL_STM32_CUBEMX
2109 if (HAL_HASHEx_SHA224_Accumulate(
2110 &sha224->hashHandle, local, SHA224_REG_SIZE) != HAL_OK) {
2111 ret = ASYNC_OP_E;
2112 }
2113 #else
2114 HASH_DataIn(*(uint32_t*)local);
2115 #endif
2116
2117 AddLength(sha224, SHA224_REG_SIZE);
2118 sha224->buffLen = 0;
2119 }
2120 }
2121 return ret;
2122 }
2123
2124 static int Sha224Final(wc_Sha256* sha224)
2125 {
2126 int ret = 0;
2127
2128 #ifdef WOLFSSL_STM32_CUBEMX
2129 if (HAL_HASHEx_SHA224_Start(&sha224->hashHandle,
2130 (byte*)sha224->buffer, sha224->buffLen,
2131 (byte*)sha224->digest, SHA224_HW_TIMEOUT) != HAL_OK) {
2132 ret = ASYNC_OP_E;
2133 }
2134 #else
2135 __IO uint16_t nbvalidbitsdata = 0;
2136
2137 /* feed any remaining buffered bytes into the FIFO */
2138 if (sha224->buffLen > 0) {
2139 HASH_DataIn(*(uint32_t*)sha224->buffer);
2140 AddLength(sha224, sha224->buffLen);
2141 }
2142
2143 /* calculate number of valid bits in last word of input data */
2144 nbvalidbitsdata = 8 * (sha224->loLen % SHA224_REG_SIZE);
2145
2146 /* configure number of valid bits in last word of the data */
2147 HASH_SetLastWordValidBitsNbr(nbvalidbitsdata);
2148
2149 /* start HASH processor */
2150 HASH_StartDigest();
2151
2152 /* wait until Busy flag == RESET */
2153 while (HASH_GetFlagStatus(HASH_FLAG_BUSY) != RESET) {}
2154
2155 /* read message digest */
2156 sha224->digest[0] = HASH->HR[0];
2157 sha224->digest[1] = HASH->HR[1];
2158 sha224->digest[2] = HASH->HR[2];
2159 sha224->digest[3] = HASH->HR[3];
2160 sha224->digest[4] = HASH->HR[4];
2161 sha224->digest[5] = HASH_DIGEST->HR[5];
2162 sha224->digest[6] = HASH_DIGEST->HR[6];
2163
2164 ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE);
2165 #endif /* WOLFSSL_STM32_CUBEMX */
2166
2167 return ret;
2168 }
2169
2170#else
2171
2172 static int InitSha224(wc_Sha224* sha224)
2173 {
2174 int ret = 0;
2175
2176 if (sha224 == NULL) {
2177 return BAD_FUNC_ARG;
2178 }
2179
2180 sha224->digest[0] = 0xc1059ed8;
2181 sha224->digest[1] = 0x367cd507;
2182 sha224->digest[2] = 0x3070dd17;
2183 sha224->digest[3] = 0xf70e5939;
2184 sha224->digest[4] = 0xffc00b31;
2185 sha224->digest[5] = 0x68581511;
2186 sha224->digest[6] = 0x64f98fa7;
2187 sha224->digest[7] = 0xbefa4fa4;
2188
2189 sha224->buffLen = 0;
2190 sha224->loLen = 0;
2191 sha224->hiLen = 0;
2192
2193 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
2194 /* choose best Transform function under this runtime environment */
2195 Sha256_SetTransform();
2196 #endif
2197
2198 return ret;
2199 }
2200
2201#endif /* STM32_HASH */
2202
2203 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
2204 {
2205 int ret = 0;
2206
2207 if (sha224 == NULL)
2208 return BAD_FUNC_ARG;
2209
2210 sha224->heap = heap;
2211
2212 ret = InitSha224(sha224);
2213 if (ret != 0)
2214 return ret;
2215
2216 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2217 ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
2218 WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
2219 #else
2220 (void)devId;
2221 #endif /* WOLFSSL_ASYNC_CRYPT */
2222
2223 return ret;
2224 }
2225
2226 int wc_InitSha224(wc_Sha224* sha224)
2227 {
2228 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
2229 }
2230
2231 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
2232 {
2233 int ret;
2234
2235 if (sha224 == NULL || (data == NULL && len > 0)) {
2236 return BAD_FUNC_ARG;
2237 }
2238
2239 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2240 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2241 #if defined(HAVE_INTEL_QA)
2242 return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
2243 #endif
2244 }
2245 #endif /* WOLFSSL_ASYNC_CRYPT */
2246
2247 ret = Sha256Update((wc_Sha256*)sha224, data, len);
2248
2249 return ret;
2250 }
2251
2252 int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
2253 {
2254 int ret;
2255
2256 if (sha224 == NULL || hash == NULL) {
2257 return BAD_FUNC_ARG;
2258 }
2259
2260 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2261 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
2262 #if defined(HAVE_INTEL_QA)
2263 return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
2264 WC_SHA224_DIGEST_SIZE);
2265 #endif
2266 }
2267 #endif /* WOLFSSL_ASYNC_CRYPT */
2268
2269 ret = Sha256Final((wc_Sha256*)sha224);
2270 if (ret != 0)
2271 return ret;
2272
2273 #if defined(LITTLE_ENDIAN_ORDER) && !defined(STM32_HASH)
2274 ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE);
2275 #endif
2276 XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);
2277
2278 return InitSha224(sha224); /* reset state */
2279 }
2280
2281 void wc_Sha224Free(wc_Sha224* sha224)
2282 {
2283 if (sha224 == NULL)
2284 return;
2285
2286 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
2287 wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
2288 #endif /* WOLFSSL_ASYNC_CRYPT */
2289 }
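 /* Illustrative only (excluded from the build): typical one-shot use of the
  * SHA-224 wrappers defined above. HashWithSha224 and its arguments are
  * hypothetical names, not part of the wolfCrypt API. */
 #if 0
 static int HashWithSha224(const byte* msg, word32 msgSz,
                           byte digest[WC_SHA224_DIGEST_SIZE])
 {
     wc_Sha224 sha224;
     int ret;

     ret = wc_InitSha224(&sha224);
     if (ret == 0)
         ret = wc_Sha224Update(&sha224, msg, msgSz);
     if (ret == 0)
         ret = wc_Sha224Final(&sha224, digest); /* also resets the state */
     wc_Sha224Free(&sha224);
     return ret;
 }
 #endif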
2290
2291#endif /* WOLFSSL_SHA224 */
2292
2293
2294int wc_InitSha256(wc_Sha256* sha256)
2295{
2296 return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
2297}
2298
2299void wc_Sha256Free(wc_Sha256* sha256)
2300{
2301 if (sha256 == NULL)
2302 return;
2303
2304#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
2305 wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
2306#endif /* WOLFSSL_ASYNC_CRYPT */
2307}
2308
2309#endif /* !WOLFSSL_TI_HASH */
2310#endif /* HAVE_FIPS */
2311
2312
2313#ifndef WOLFSSL_TI_HASH
2314#ifdef WOLFSSL_SHA224
2315 int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
2316 {
2317 int ret;
2318 wc_Sha224 tmpSha224;
2319
2320 if (sha224 == NULL || hash == NULL)
2321 return BAD_FUNC_ARG;
2322
2323 ret = wc_Sha224Copy(sha224, &tmpSha224);
2324 if (ret == 0) {
2325 ret = wc_Sha224Final(&tmpSha224, hash);
2326 }
2327 return ret;
2328 }
2329 int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
2330 {
2331 int ret = 0;
2332
2333 if (src == NULL || dst == NULL)
2334 return BAD_FUNC_ARG;
2335
2336 XMEMCPY(dst, src, sizeof(wc_Sha224));
2337
2338 #ifdef WOLFSSL_ASYNC_CRYPT
2339 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2340 #endif
2341
2342 return ret;
2343 }
2344#endif /* WOLFSSL_SHA224 */
2345
2346int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
2347{
2348 int ret;
2349 wc_Sha256 tmpSha256;
2350
2351 if (sha256 == NULL || hash == NULL)
2352 return BAD_FUNC_ARG;
2353
2354 ret = wc_Sha256Copy(sha256, &tmpSha256);
2355 if (ret == 0) {
2356 ret = wc_Sha256Final(&tmpSha256, hash);
2357 }
2358 return ret;
2359}
2360int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
2361{
2362 int ret = 0;
2363
2364 if (src == NULL || dst == NULL)
2365 return BAD_FUNC_ARG;
2366
2367 XMEMCPY(dst, src, sizeof(wc_Sha256));
2368
2369#ifdef WOLFSSL_ASYNC_CRYPT
2370 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
2371#endif
2372#ifdef WOLFSSL_PIC32MZ_HASH
2373 ret = wc_Pic32HashCopy(&src->cache, &dst->cache);
2374#endif
2375
2376 return ret;
2377}
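/* Illustrative only (excluded from the build): wc_Sha256GetHash finalizes a
 * copy of the context, so an intermediate digest can be read without
 * disturbing the running hash. IntermediateThenFinal is a hypothetical name,
 * not part of the wolfCrypt API. */
#if 0
static int IntermediateThenFinal(wc_Sha256* sha256,
                                 const byte* more, word32 moreSz,
                                 byte mid[WC_SHA256_DIGEST_SIZE],
                                 byte last[WC_SHA256_DIGEST_SIZE])
{
    int ret = wc_Sha256GetHash(sha256, mid);   /* snapshot via wc_Sha256Copy */
    if (ret == 0)
        ret = wc_Sha256Update(sha256, more, moreSz);
    if (ret == 0)
        ret = wc_Sha256Final(sha256, last);
    return ret;
}
#endif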
2378#endif /* !WOLFSSL_TI_HASH */
2379
2380#endif /* NO_SHA256 */