Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

bn_asm.c@ 331

Last change on this file since 331 was 331, checked in by coas-nagasima, 6 years ago
prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新 ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新以下のOSSをアップデート・mruby-1.3.0 ・musl-1.1.18 ・onigmo-6.1.3 ・tcc-0.9.27 以下のOSSを追加・openssl-1.1.0e ・curl-7.57.0 ・zlib-1.2.11 以下のmrbgemsを追加・iij/mruby-digest ・iij/mruby-env ・iij/mruby-errno ・iij/mruby-iijson ・iij/mruby-ipaddr ・iij/mruby-mock ・iij/mruby-require ・iij/mruby-tls-openssl
Property svn:eol-style set to `native` Property svn:mime-type set to `text/x-csrc`
File size: 26.9 KB

Line
1	/*
2	* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
3	*
4	* Licensed under the OpenSSL license (the "License"). You may not use
5	* this file except in compliance with the License. You can obtain a copy
6	* in the file LICENSE in the source distribution or at
7	* https://www.openssl.org/source/license.html
8	*/
9
10	#include <assert.h>
11	#include <openssl/crypto.h>
12	#include "internal/cryptlib.h"
13	#include "bn_lcl.h"
14
15	#if defined(BN_LLONG) \|\| defined(BN_UMULT_HIGH)
16
17	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
18	BN_ULONG w)
19	{
20	BN_ULONG c1 = 0;
21
22	assert(num >= 0);
23	if (num <= 0)
24	return (c1);
25
26	# ifndef OPENSSL_SMALL_FOOTPRINT
27	while (num & ~3) {
28	mul_add(rp[0], ap[0], w, c1);
29	mul_add(rp[1], ap[1], w, c1);
30	mul_add(rp[2], ap[2], w, c1);
31	mul_add(rp[3], ap[3], w, c1);
32	ap += 4;
33	rp += 4;
34	num -= 4;
35	}
36	# endif
37	while (num) {
38	mul_add(rp[0], ap[0], w, c1);
39	ap++;
40	rp++;
41	num--;
42	}
43
44	return (c1);
45	}
46
47	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
48	{
49	BN_ULONG c1 = 0;
50
51	assert(num >= 0);
52	if (num <= 0)
53	return (c1);
54
55	# ifndef OPENSSL_SMALL_FOOTPRINT
56	while (num & ~3) {
57	mul(rp[0], ap[0], w, c1);
58	mul(rp[1], ap[1], w, c1);
59	mul(rp[2], ap[2], w, c1);
60	mul(rp[3], ap[3], w, c1);
61	ap += 4;
62	rp += 4;
63	num -= 4;
64	}
65	# endif
66	while (num) {
67	mul(rp[0], ap[0], w, c1);
68	ap++;
69	rp++;
70	num--;
71	}
72	return (c1);
73	}
74
75	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
76	{
77	assert(n >= 0);
78	if (n <= 0)
79	return;
80
81	# ifndef OPENSSL_SMALL_FOOTPRINT
82	while (n & ~3) {
83	sqr(r[0], r[1], a[0]);
84	sqr(r[2], r[3], a[1]);
85	sqr(r[4], r[5], a[2]);
86	sqr(r[6], r[7], a[3]);
87	a += 4;
88	r += 8;
89	n -= 4;
90	}
91	# endif
92	while (n) {
93	sqr(r[0], r[1], a[0]);
94	a++;
95	r += 2;
96	n--;
97	}
98	}
99
#else                           /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */
102
103	BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num,
104	BN_ULONG w)
105	{
106	BN_ULONG c = 0;
107	BN_ULONG bl, bh;
108
109	assert(num >= 0);
110	if (num <= 0)
111	return ((BN_ULONG)0);
112
113	bl = LBITS(w);
114	bh = HBITS(w);
115
116	# ifndef OPENSSL_SMALL_FOOTPRINT
117	while (num & ~3) {
118	mul_add(rp[0], ap[0], bl, bh, c);
119	mul_add(rp[1], ap[1], bl, bh, c);
120	mul_add(rp[2], ap[2], bl, bh, c);
121	mul_add(rp[3], ap[3], bl, bh, c);
122	ap += 4;
123	rp += 4;
124	num -= 4;
125	}
126	# endif
127	while (num) {
128	mul_add(rp[0], ap[0], bl, bh, c);
129	ap++;
130	rp++;
131	num--;
132	}
133	return (c);
134	}
135
136	BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
137	{
138	BN_ULONG carry = 0;
139	BN_ULONG bl, bh;
140
141	assert(num >= 0);
142	if (num <= 0)
143	return ((BN_ULONG)0);
144
145	bl = LBITS(w);
146	bh = HBITS(w);
147
148	# ifndef OPENSSL_SMALL_FOOTPRINT
149	while (num & ~3) {
150	mul(rp[0], ap[0], bl, bh, carry);
151	mul(rp[1], ap[1], bl, bh, carry);
152	mul(rp[2], ap[2], bl, bh, carry);
153	mul(rp[3], ap[3], bl, bh, carry);
154	ap += 4;
155	rp += 4;
156	num -= 4;
157	}
158	# endif
159	while (num) {
160	mul(rp[0], ap[0], bl, bh, carry);
161	ap++;
162	rp++;
163	num--;
164	}
165	return (carry);
166	}
167
168	void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
169	{
170	assert(n >= 0);
171	if (n <= 0)
172	return;
173
174	# ifndef OPENSSL_SMALL_FOOTPRINT
175	while (n & ~3) {
176	sqr64(r[0], r[1], a[0]);
177	sqr64(r[2], r[3], a[1]);
178	sqr64(r[4], r[5], a[2]);
179	sqr64(r[6], r[7], a[3]);
180	a += 4;
181	r += 8;
182	n -= 4;
183	}
184	# endif
185	while (n) {
186	sqr64(r[0], r[1], a[0]);
187	a++;
188	r += 2;
189	n--;
190	}
191	}
192
#endif                          /* !(defined(BN_LLONG) ||
                                 * defined(BN_UMULT_HIGH)) */
195
196	#if defined(BN_LLONG) && defined(BN_DIV2W)
197
198	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
199	{
200	return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) \| l) / (BN_ULLONG) d));
201	}
202
203	#else
204
205	/* Divide h,l by d and return the result. */
206	/* I need to test this some more :-( */
207	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
208	{
209	BN_ULONG dh, dl, q, ret = 0, th, tl, t;
210	int i, count = 2;
211
212	if (d == 0)
213	return (BN_MASK2);
214
215	i = BN_num_bits_word(d);
216	assert((i == BN_BITS2) \|\| (h <= (BN_ULONG)1 << i));
217
218	i = BN_BITS2 - i;
219	if (h >= d)
220	h -= d;
221
222	if (i) {
223	d <<= i;
224	h = (h << i) \| (l >> (BN_BITS2 - i));
225	l <<= i;
226	}
227	dh = (d & BN_MASK2h) >> BN_BITS4;
228	dl = (d & BN_MASK2l);
229	for (;;) {
230	if ((h >> BN_BITS4) == dh)
231	q = BN_MASK2l;
232	else
233	q = h / dh;
234
235	th = q * dh;
236	tl = dl * q;
237	for (;;) {
238	t = h - th;
239	if ((t & BN_MASK2h) \|\|
240	((tl) <= ((t << BN_BITS4) \| ((l & BN_MASK2h) >> BN_BITS4))))
241	break;
242	q--;
243	th -= dh;
244	tl -= dl;
245	}
246	t = (tl >> BN_BITS4);
247	tl = (tl << BN_BITS4) & BN_MASK2h;
248	th += t;
249
250	if (l < tl)
251	th++;
252	l -= tl;
253	if (h < th) {
254	h += d;
255	q--;
256	}
257	h -= th;
258
259	if (--count == 0)
260	break;
261
262	ret = q << BN_BITS4;
263	h = ((h << BN_BITS4) \| (l >> BN_BITS4)) & BN_MASK2;
264	l = (l & BN_MASK2l) << BN_BITS4;
265	}
266	ret \|= q;
267	return (ret);
268	}
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
270
271	#ifdef BN_LLONG
272	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
273	int n)
274	{
275	BN_ULLONG ll = 0;
276
277	assert(n >= 0);
278	if (n <= 0)
279	return ((BN_ULONG)0);
280
281	# ifndef OPENSSL_SMALL_FOOTPRINT
282	while (n & ~3) {
283	ll += (BN_ULLONG) a[0] + b[0];
284	r[0] = (BN_ULONG)ll & BN_MASK2;
285	ll >>= BN_BITS2;
286	ll += (BN_ULLONG) a[1] + b[1];
287	r[1] = (BN_ULONG)ll & BN_MASK2;
288	ll >>= BN_BITS2;
289	ll += (BN_ULLONG) a[2] + b[2];
290	r[2] = (BN_ULONG)ll & BN_MASK2;
291	ll >>= BN_BITS2;
292	ll += (BN_ULLONG) a[3] + b[3];
293	r[3] = (BN_ULONG)ll & BN_MASK2;
294	ll >>= BN_BITS2;
295	a += 4;
296	b += 4;
297	r += 4;
298	n -= 4;
299	}
300	# endif
301	while (n) {
302	ll += (BN_ULLONG) a[0] + b[0];
303	r[0] = (BN_ULONG)ll & BN_MASK2;
304	ll >>= BN_BITS2;
305	a++;
306	b++;
307	r++;
308	n--;
309	}
310	return ((BN_ULONG)ll);
311	}
312	#else /* !BN_LLONG */
313	BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
314	int n)
315	{
316	BN_ULONG c, l, t;
317
318	assert(n >= 0);
319	if (n <= 0)
320	return ((BN_ULONG)0);
321
322	c = 0;
323	# ifndef OPENSSL_SMALL_FOOTPRINT
324	while (n & ~3) {
325	t = a[0];
326	t = (t + c) & BN_MASK2;
327	c = (t < c);
328	l = (t + b[0]) & BN_MASK2;
329	c += (l < t);
330	r[0] = l;
331	t = a[1];
332	t = (t + c) & BN_MASK2;
333	c = (t < c);
334	l = (t + b[1]) & BN_MASK2;
335	c += (l < t);
336	r[1] = l;
337	t = a[2];
338	t = (t + c) & BN_MASK2;
339	c = (t < c);
340	l = (t + b[2]) & BN_MASK2;
341	c += (l < t);
342	r[2] = l;
343	t = a[3];
344	t = (t + c) & BN_MASK2;
345	c = (t < c);
346	l = (t + b[3]) & BN_MASK2;
347	c += (l < t);
348	r[3] = l;
349	a += 4;
350	b += 4;
351	r += 4;
352	n -= 4;
353	}
354	# endif
355	while (n) {
356	t = a[0];
357	t = (t + c) & BN_MASK2;
358	c = (t < c);
359	l = (t + b[0]) & BN_MASK2;
360	c += (l < t);
361	r[0] = l;
362	a++;
363	b++;
364	r++;
365	n--;
366	}
367	return ((BN_ULONG)c);
368	}
369	#endif /* !BN_LLONG */
370
371	BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b,
372	int n)
373	{
374	BN_ULONG t1, t2;
375	int c = 0;
376
377	assert(n >= 0);
378	if (n <= 0)
379	return ((BN_ULONG)0);
380
381	#ifndef OPENSSL_SMALL_FOOTPRINT
382	while (n & ~3) {
383	t1 = a[0];
384	t2 = b[0];
385	r[0] = (t1 - t2 - c) & BN_MASK2;
386	if (t1 != t2)
387	c = (t1 < t2);
388	t1 = a[1];
389	t2 = b[1];
390	r[1] = (t1 - t2 - c) & BN_MASK2;
391	if (t1 != t2)
392	c = (t1 < t2);
393	t1 = a[2];
394	t2 = b[2];
395	r[2] = (t1 - t2 - c) & BN_MASK2;
396	if (t1 != t2)
397	c = (t1 < t2);
398	t1 = a[3];
399	t2 = b[3];
400	r[3] = (t1 - t2 - c) & BN_MASK2;
401	if (t1 != t2)
402	c = (t1 < t2);
403	a += 4;
404	b += 4;
405	r += 4;
406	n -= 4;
407	}
408	#endif
409	while (n) {
410	t1 = a[0];
411	t2 = b[0];
412	r[0] = (t1 - t2 - c) & BN_MASK2;
413	if (t1 != t2)
414	c = (t1 < t2);
415	a++;
416	b++;
417	r++;
418	n--;
419	}
420	return (c);
421	}
422
423	#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
424
425	# undef bn_mul_comba8
426	# undef bn_mul_comba4
427	# undef bn_sqr_comba8
428	# undef bn_sqr_comba4
429
/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
437
/*
 * Four interchangeable definitions of the comba helper macros, selected by
 * the multiply facility the build provides.  In every variant c0, c1 and c2
 * must be modifiable lvalues: c0 receives the low product word, c1 the high
 * word plus the carry out of c0, and c2 the carry out of c1.
 */
# ifdef BN_LLONG
/*
 * Keep in mind that additions to multiplication result can not
 * overflow, because its high half cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* Adds the same product twice; tt holds the first sum, t the second. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
        BN_ULLONG tt = t+c0;    /* no carry */  \
        c0 = (BN_ULONG)Lw(tt);                  \
        hi = (BN_ULONG)Hw(tt);                  \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* The array argument is parenthesised, as in the other three variants. */
#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG hi;                            \
        BN_ULLONG t = (BN_ULLONG)(a)[i]*(a)[i]; \
        t += c0;                /* no carry */  \
        c0 = (BN_ULONG)Lw(t);                   \
        hi = (BN_ULONG)Hw(t);                   \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_LOHI)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* Adds the same product twice; tt carries the first high word. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo, hi, tt;                    \
        BN_UMULT_LOHI(lo,hi,ta,tb);             \
        c0 += lo; tt = hi+((c0<lo)?1:0);        \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo, hi;                        \
        BN_UMULT_LOHI(lo,hi,ta,ta);             \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# elif defined(BN_UMULT_HIGH)
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG ta = (a), tb = (b);            \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

/* Adds the same product twice; tt carries the first high word. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG ta = (a), tb = (b), tt;        \
        BN_ULONG lo = ta * tb;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
        c0 += lo; tt = hi + ((c0<lo)?1:0);      \
        c1 += tt; c2 += (c1<tt)?1:0;            \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG ta = (a)[i];                   \
        BN_ULONG lo = ta * ta;                  \
        BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
        c0 += lo; hi += (c0<lo)?1:0;            \
        c1 += hi; c2 += (c1<hi)?1:0;            \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2)    \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)

# else                          /* !BN_LLONG */
/*
 * Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones.
 */
#  define mul_add_c(a,b,c0,c1,c2)       do {    \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

/* Adds the same product twice; tt carries the first high word. */
#  define mul_add_c2(a,b,c0,c1,c2)      do {    \
        BN_ULONG tt;                            \
        BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
        BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
        mul64(lo,hi,bl,bh);                     \
        tt = hi;                                \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
        c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c(a,i,c0,c1,c2)       do {    \
        BN_ULONG lo, hi;                        \
        sqr64(lo,hi,(a)[i]);                    \
        c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
        c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
        } while(0)

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif                         /* !BN_LLONG */
580
581	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
582	{
583	BN_ULONG c1, c2, c3;
584
585	c1 = 0;
586	c2 = 0;
587	c3 = 0;
588	mul_add_c(a[0], b[0], c1, c2, c3);
589	r[0] = c1;
590	c1 = 0;
591	mul_add_c(a[0], b[1], c2, c3, c1);
592	mul_add_c(a[1], b[0], c2, c3, c1);
593	r[1] = c2;
594	c2 = 0;
595	mul_add_c(a[2], b[0], c3, c1, c2);
596	mul_add_c(a[1], b[1], c3, c1, c2);
597	mul_add_c(a[0], b[2], c3, c1, c2);
598	r[2] = c3;
599	c3 = 0;
600	mul_add_c(a[0], b[3], c1, c2, c3);
601	mul_add_c(a[1], b[2], c1, c2, c3);
602	mul_add_c(a[2], b[1], c1, c2, c3);
603	mul_add_c(a[3], b[0], c1, c2, c3);
604	r[3] = c1;
605	c1 = 0;
606	mul_add_c(a[4], b[0], c2, c3, c1);
607	mul_add_c(a[3], b[1], c2, c3, c1);
608	mul_add_c(a[2], b[2], c2, c3, c1);
609	mul_add_c(a[1], b[3], c2, c3, c1);
610	mul_add_c(a[0], b[4], c2, c3, c1);
611	r[4] = c2;
612	c2 = 0;
613	mul_add_c(a[0], b[5], c3, c1, c2);
614	mul_add_c(a[1], b[4], c3, c1, c2);
615	mul_add_c(a[2], b[3], c3, c1, c2);
616	mul_add_c(a[3], b[2], c3, c1, c2);
617	mul_add_c(a[4], b[1], c3, c1, c2);
618	mul_add_c(a[5], b[0], c3, c1, c2);
619	r[5] = c3;
620	c3 = 0;
621	mul_add_c(a[6], b[0], c1, c2, c3);
622	mul_add_c(a[5], b[1], c1, c2, c3);
623	mul_add_c(a[4], b[2], c1, c2, c3);
624	mul_add_c(a[3], b[3], c1, c2, c3);
625	mul_add_c(a[2], b[4], c1, c2, c3);
626	mul_add_c(a[1], b[5], c1, c2, c3);
627	mul_add_c(a[0], b[6], c1, c2, c3);
628	r[6] = c1;
629	c1 = 0;
630	mul_add_c(a[0], b[7], c2, c3, c1);
631	mul_add_c(a[1], b[6], c2, c3, c1);
632	mul_add_c(a[2], b[5], c2, c3, c1);
633	mul_add_c(a[3], b[4], c2, c3, c1);
634	mul_add_c(a[4], b[3], c2, c3, c1);
635	mul_add_c(a[5], b[2], c2, c3, c1);
636	mul_add_c(a[6], b[1], c2, c3, c1);
637	mul_add_c(a[7], b[0], c2, c3, c1);
638	r[7] = c2;
639	c2 = 0;
640	mul_add_c(a[7], b[1], c3, c1, c2);
641	mul_add_c(a[6], b[2], c3, c1, c2);
642	mul_add_c(a[5], b[3], c3, c1, c2);
643	mul_add_c(a[4], b[4], c3, c1, c2);
644	mul_add_c(a[3], b[5], c3, c1, c2);
645	mul_add_c(a[2], b[6], c3, c1, c2);
646	mul_add_c(a[1], b[7], c3, c1, c2);
647	r[8] = c3;
648	c3 = 0;
649	mul_add_c(a[2], b[7], c1, c2, c3);
650	mul_add_c(a[3], b[6], c1, c2, c3);
651	mul_add_c(a[4], b[5], c1, c2, c3);
652	mul_add_c(a[5], b[4], c1, c2, c3);
653	mul_add_c(a[6], b[3], c1, c2, c3);
654	mul_add_c(a[7], b[2], c1, c2, c3);
655	r[9] = c1;
656	c1 = 0;
657	mul_add_c(a[7], b[3], c2, c3, c1);
658	mul_add_c(a[6], b[4], c2, c3, c1);
659	mul_add_c(a[5], b[5], c2, c3, c1);
660	mul_add_c(a[4], b[6], c2, c3, c1);
661	mul_add_c(a[3], b[7], c2, c3, c1);
662	r[10] = c2;
663	c2 = 0;
664	mul_add_c(a[4], b[7], c3, c1, c2);
665	mul_add_c(a[5], b[6], c3, c1, c2);
666	mul_add_c(a[6], b[5], c3, c1, c2);
667	mul_add_c(a[7], b[4], c3, c1, c2);
668	r[11] = c3;
669	c3 = 0;
670	mul_add_c(a[7], b[5], c1, c2, c3);
671	mul_add_c(a[6], b[6], c1, c2, c3);
672	mul_add_c(a[5], b[7], c1, c2, c3);
673	r[12] = c1;
674	c1 = 0;
675	mul_add_c(a[6], b[7], c2, c3, c1);
676	mul_add_c(a[7], b[6], c2, c3, c1);
677	r[13] = c2;
678	c2 = 0;
679	mul_add_c(a[7], b[7], c3, c1, c2);
680	r[14] = c3;
681	r[15] = c1;
682	}
683
684	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
685	{
686	BN_ULONG c1, c2, c3;
687
688	c1 = 0;
689	c2 = 0;
690	c3 = 0;
691	mul_add_c(a[0], b[0], c1, c2, c3);
692	r[0] = c1;
693	c1 = 0;
694	mul_add_c(a[0], b[1], c2, c3, c1);
695	mul_add_c(a[1], b[0], c2, c3, c1);
696	r[1] = c2;
697	c2 = 0;
698	mul_add_c(a[2], b[0], c3, c1, c2);
699	mul_add_c(a[1], b[1], c3, c1, c2);
700	mul_add_c(a[0], b[2], c3, c1, c2);
701	r[2] = c3;
702	c3 = 0;
703	mul_add_c(a[0], b[3], c1, c2, c3);
704	mul_add_c(a[1], b[2], c1, c2, c3);
705	mul_add_c(a[2], b[1], c1, c2, c3);
706	mul_add_c(a[3], b[0], c1, c2, c3);
707	r[3] = c1;
708	c1 = 0;
709	mul_add_c(a[3], b[1], c2, c3, c1);
710	mul_add_c(a[2], b[2], c2, c3, c1);
711	mul_add_c(a[1], b[3], c2, c3, c1);
712	r[4] = c2;
713	c2 = 0;
714	mul_add_c(a[2], b[3], c3, c1, c2);
715	mul_add_c(a[3], b[2], c3, c1, c2);
716	r[5] = c3;
717	c3 = 0;
718	mul_add_c(a[3], b[3], c1, c2, c3);
719	r[6] = c1;
720	r[7] = c2;
721	}
722
723	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
724	{
725	BN_ULONG c1, c2, c3;
726
727	c1 = 0;
728	c2 = 0;
729	c3 = 0;
730	sqr_add_c(a, 0, c1, c2, c3);
731	r[0] = c1;
732	c1 = 0;
733	sqr_add_c2(a, 1, 0, c2, c3, c1);
734	r[1] = c2;
735	c2 = 0;
736	sqr_add_c(a, 1, c3, c1, c2);
737	sqr_add_c2(a, 2, 0, c3, c1, c2);
738	r[2] = c3;
739	c3 = 0;
740	sqr_add_c2(a, 3, 0, c1, c2, c3);
741	sqr_add_c2(a, 2, 1, c1, c2, c3);
742	r[3] = c1;
743	c1 = 0;
744	sqr_add_c(a, 2, c2, c3, c1);
745	sqr_add_c2(a, 3, 1, c2, c3, c1);
746	sqr_add_c2(a, 4, 0, c2, c3, c1);
747	r[4] = c2;
748	c2 = 0;
749	sqr_add_c2(a, 5, 0, c3, c1, c2);
750	sqr_add_c2(a, 4, 1, c3, c1, c2);
751	sqr_add_c2(a, 3, 2, c3, c1, c2);
752	r[5] = c3;
753	c3 = 0;
754	sqr_add_c(a, 3, c1, c2, c3);
755	sqr_add_c2(a, 4, 2, c1, c2, c3);
756	sqr_add_c2(a, 5, 1, c1, c2, c3);
757	sqr_add_c2(a, 6, 0, c1, c2, c3);
758	r[6] = c1;
759	c1 = 0;
760	sqr_add_c2(a, 7, 0, c2, c3, c1);
761	sqr_add_c2(a, 6, 1, c2, c3, c1);
762	sqr_add_c2(a, 5, 2, c2, c3, c1);
763	sqr_add_c2(a, 4, 3, c2, c3, c1);
764	r[7] = c2;
765	c2 = 0;
766	sqr_add_c(a, 4, c3, c1, c2);
767	sqr_add_c2(a, 5, 3, c3, c1, c2);
768	sqr_add_c2(a, 6, 2, c3, c1, c2);
769	sqr_add_c2(a, 7, 1, c3, c1, c2);
770	r[8] = c3;
771	c3 = 0;
772	sqr_add_c2(a, 7, 2, c1, c2, c3);
773	sqr_add_c2(a, 6, 3, c1, c2, c3);
774	sqr_add_c2(a, 5, 4, c1, c2, c3);
775	r[9] = c1;
776	c1 = 0;
777	sqr_add_c(a, 5, c2, c3, c1);
778	sqr_add_c2(a, 6, 4, c2, c3, c1);
779	sqr_add_c2(a, 7, 3, c2, c3, c1);
780	r[10] = c2;
781	c2 = 0;
782	sqr_add_c2(a, 7, 4, c3, c1, c2);
783	sqr_add_c2(a, 6, 5, c3, c1, c2);
784	r[11] = c3;
785	c3 = 0;
786	sqr_add_c(a, 6, c1, c2, c3);
787	sqr_add_c2(a, 7, 5, c1, c2, c3);
788	r[12] = c1;
789	c1 = 0;
790	sqr_add_c2(a, 7, 6, c2, c3, c1);
791	r[13] = c2;
792	c2 = 0;
793	sqr_add_c(a, 7, c3, c1, c2);
794	r[14] = c3;
795	r[15] = c1;
796	}
797
798	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
799	{
800	BN_ULONG c1, c2, c3;
801
802	c1 = 0;
803	c2 = 0;
804	c3 = 0;
805	sqr_add_c(a, 0, c1, c2, c3);
806	r[0] = c1;
807	c1 = 0;
808	sqr_add_c2(a, 1, 0, c2, c3, c1);
809	r[1] = c2;
810	c2 = 0;
811	sqr_add_c(a, 1, c3, c1, c2);
812	sqr_add_c2(a, 2, 0, c3, c1, c2);
813	r[2] = c3;
814	c3 = 0;
815	sqr_add_c2(a, 3, 0, c1, c2, c3);
816	sqr_add_c2(a, 2, 1, c1, c2, c3);
817	r[3] = c1;
818	c1 = 0;
819	sqr_add_c(a, 2, c2, c3, c1);
820	sqr_add_c2(a, 3, 1, c2, c3, c1);
821	r[4] = c2;
822	c2 = 0;
823	sqr_add_c2(a, 3, 2, c3, c1, c2);
824	r[5] = c3;
825	c3 = 0;
826	sqr_add_c(a, 3, c1, c2, c3);
827	r[6] = c1;
828	r[7] = c2;
829	}
830
831	# ifdef OPENSSL_NO_ASM
832	# ifdef OPENSSL_BN_ASM_MONT
833	# include <alloca.h>
834	/*
835	* This is essentially reference implementation, which may or may not
836	* result in performance improvement. E.g. on IA-32 this routine was
837	* observed to give 40% faster rsa1024 private key operations and 10%
838	* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
839	* by 10% and worsens rsa4096 sign by 15%. Once again, it's a
840	* reference implementation, one to be used as starting point for
841	* platform-specific assembler. Mentioned numbers apply to compiler
842	* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
843	* can vary not only from platform to platform, but even for compiler
844	* versions. Assembler vs. assembler improvement coefficients can
845	* [and are known to] differ and are to be documented elsewhere.
846	*/
847	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
848	const BN_ULONG np, const BN_ULONG n0p, int num)
849	{
850	BN_ULONG c0, c1, ml, *tp, n0;
851	# ifdef mul64
852	BN_ULONG mh;
853	# endif
854	volatile BN_ULONG *vp;
855	int i = 0, j;
856
857	# if 0 /* template for platform-specific
858	* implementation */
859	if (ap == bp)
860	return bn_sqr_mont(rp, ap, np, n0p, num);
861	# endif
862	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
863
864	n0 = *n0p;
865
866	c0 = 0;
867	ml = bp[0];
868	# ifdef mul64
869	mh = HBITS(ml);
870	ml = LBITS(ml);
871	for (j = 0; j < num; ++j)
872	mul(tp[j], ap[j], ml, mh, c0);
873	# else
874	for (j = 0; j < num; ++j)
875	mul(tp[j], ap[j], ml, c0);
876	# endif
877
878	tp[num] = c0;
879	tp[num + 1] = 0;
880	goto enter;
881
882	for (i = 0; i < num; i++) {
883	c0 = 0;
884	ml = bp[i];
885	# ifdef mul64
886	mh = HBITS(ml);
887	ml = LBITS(ml);
888	for (j = 0; j < num; ++j)
889	mul_add(tp[j], ap[j], ml, mh, c0);
890	# else
891	for (j = 0; j < num; ++j)
892	mul_add(tp[j], ap[j], ml, c0);
893	# endif
894	c1 = (tp[num] + c0) & BN_MASK2;
895	tp[num] = c1;
896	tp[num + 1] = (c1 < c0 ? 1 : 0);
897	enter:
898	c1 = tp[0];
899	ml = (c1 * n0) & BN_MASK2;
900	c0 = 0;
901	# ifdef mul64
902	mh = HBITS(ml);
903	ml = LBITS(ml);
904	mul_add(c1, np[0], ml, mh, c0);
905	# else
906	mul_add(c1, ml, np[0], c0);
907	# endif
908	for (j = 1; j < num; j++) {
909	c1 = tp[j];
910	# ifdef mul64
911	mul_add(c1, np[j], ml, mh, c0);
912	# else
913	mul_add(c1, ml, np[j], c0);
914	# endif
915	tp[j - 1] = c1 & BN_MASK2;
916	}
917	c1 = (tp[num] + c0) & BN_MASK2;
918	tp[num - 1] = c1;
919	tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
920	}
921
922	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
923	c0 = bn_sub_words(rp, tp, np, num);
924	if (tp[num] != 0 \|\| c0 == 0) {
925	for (i = 0; i < num + 2; i++)
926	vp[i] = 0;
927	return 1;
928	}
929	}
930	for (i = 0; i < num; i++)
931	rp[i] = tp[i], vp[i] = 0;
932	vp[num] = 0;
933	vp[num + 1] = 0;
934	return 1;
935	}
936	# else
937	/*
938	* Return value of 0 indicates that multiplication/convolution was not
939	* performed to signal the caller to fall down to alternative/original
940	* code-path.
941	*/
942	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
943	const BN_ULONG np, const BN_ULONG n0, int num)
944	{
945	return 0;
946	}
947	# endif /* OPENSSL_BN_ASM_MONT */
948	# endif
949
950	#else /* !BN_MUL_COMBA */
951
952	/* hmm... is it faster just to do a multiply? */
953	# undef bn_sqr_comba4
954	# undef bn_sqr_comba8
955	void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
956	{
957	BN_ULONG t[8];
958	bn_sqr_normal(r, a, 4, t);
959	}
960
961	void bn_sqr_comba8(BN_ULONG r, const BN_ULONG a)
962	{
963	BN_ULONG t[16];
964	bn_sqr_normal(r, a, 8, t);
965	}
966
967	void bn_mul_comba4(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
968	{
969	r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
970	r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
971	r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
972	r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
973	}
974
975	void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
976	{
977	r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
978	r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
979	r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
980	r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
981	r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
982	r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
983	r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
984	r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
985	}
986
987	# ifdef OPENSSL_NO_ASM
988	# ifdef OPENSSL_BN_ASM_MONT
989	# include <alloca.h>
990	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
991	const BN_ULONG np, const BN_ULONG n0p, int num)
992	{
993	BN_ULONG c0, c1, tp, n0 = n0p;
994	volatile BN_ULONG *vp;
995	int i = 0, j;
996
997	vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
998
999	for (i = 0; i <= num; i++)
1000	tp[i] = 0;
1001
1002	for (i = 0; i < num; i++) {
1003	c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1004	c1 = (tp[num] + c0) & BN_MASK2;
1005	tp[num] = c1;
1006	tp[num + 1] = (c1 < c0 ? 1 : 0);
1007
1008	c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1009	c1 = (tp[num] + c0) & BN_MASK2;
1010	tp[num] = c1;
1011	tp[num + 1] += (c1 < c0 ? 1 : 0);
1012	for (j = 0; j <= num; j++)
1013	tp[j] = tp[j + 1];
1014	}
1015
1016	if (tp[num] != 0 \|\| tp[num - 1] >= np[num - 1]) {
1017	c0 = bn_sub_words(rp, tp, np, num);
1018	if (tp[num] != 0 \|\| c0 == 0) {
1019	for (i = 0; i < num + 2; i++)
1020	vp[i] = 0;
1021	return 1;
1022	}
1023	}
1024	for (i = 0; i < num; i++)
1025	rp[i] = tp[i], vp[i] = 0;
1026	vp[num] = 0;
1027	vp[num + 1] = 0;
1028	return 1;
1029	}
1030	# else
1031	int bn_mul_mont(BN_ULONG rp, const BN_ULONG ap, const BN_ULONG *bp,
1032	const BN_ULONG np, const BN_ULONG n0, int num)
1033	{
1034	return 0;
1035	}
1036	# endif /* OPENSSL_BN_ASM_MONT */
1037	# endif
1038
1039	#endif /* !BN_MUL_COMBA */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: EcnlProtoTool/trunk/openssl-1.1.0e/crypto/bn/bn_asm.c@ 331

Download in other formats: