Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

ecp_nistp256.c@ 380

Last change on this file since 380 was 331, checked in by coas-nagasima, 6 years ago
prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新 ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新以下のOSSをアップデート・mruby-1.3.0 ・musl-1.1.18 ・onigmo-6.1.3 ・tcc-0.9.27 以下のOSSを追加・openssl-1.1.0e ・curl-7.57.0 ・zlib-1.2.11 以下のmrbgemsを追加・iij/mruby-digest ・iij/mruby-env ・iij/mruby-errno ・iij/mruby-iijson ・iij/mruby-ipaddr ・iij/mruby-mock ・iij/mruby-require ・iij/mruby-tls-openssl
Property svn:eol-style set to `native` Property svn:mime-type set to `text/x-csrc`
File size: 73.2 KB

Rev	Line
[331]	1	/*
	2	* Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
	3	*
	4	* Licensed under the OpenSSL license (the "License"). You may not use
	5	* this file except in compliance with the License. You can obtain a copy
	6	* in the file LICENSE in the source distribution or at
	7	* https://www.openssl.org/source/license.html
	8	*/
	9
	10	/* Copyright 2011 Google Inc.
	11	*
	12	* Licensed under the Apache License, Version 2.0 (the "License");
	13	*
	14	* you may not use this file except in compliance with the License.
	15	* You may obtain a copy of the License at
	16	*
	17	* http://www.apache.org/licenses/LICENSE-2.0
	18	*
	19	* Unless required by applicable law or agreed to in writing, software
	20	* distributed under the License is distributed on an "AS IS" BASIS,
	21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	22	* See the License for the specific language governing permissions and
	23	* limitations under the License.
	24	*/
	25
	26	/*
	27	* A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
	28	*
	29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
	30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
	31	* work which got its smarts from Daniel J. Bernstein's work on the same.
	32	*/
	33
	34	#include <openssl/opensslconf.h>
	35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
	36	NON_EMPTY_TRANSLATION_UNIT
	37	#else
	38
	39	# include <stdint.h>
	40	# include <string.h>
	41	# include <openssl/err.h>
	42	# include "ec_lcl.h"
	43
	44	# if defined(__GNUC__) && (__GNUC__ > 3 \|\| (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
	45	/* even with gcc, the typedef won't work for 32-bit platforms */
	46	typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
	47	* platforms */
	48	typedef __int128_t int128_t;
	49	# else
	50	# error "Need GCC 3.1 or later to define type uint128_t"
	51	# endif
	52
	53	typedef uint8_t u8;
	54	typedef uint32_t u32;
	55	typedef uint64_t u64;
	56	typedef int64_t s64;
	57
	58	/*
	59	* The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
	60	* can serialise an element of this field into 32 bytes. We call this an
	61	* felem_bytearray.
	62	*/
	63
	64	typedef u8 felem_bytearray[32];
	65
	66	/*
	67	* These are the parameters of P256, taken from FIPS 186-3, page 86. These
	68	* values are big-endian.
	69	*/
	70	static const felem_bytearray nistp256_curve_params[5] = {
	71	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
	72	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	73	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
	74	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
	75	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
	76	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	77	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
	78	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
	79	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
	80	0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
	81	0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
	82	0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
	83	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
	84	0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
	85	0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
	86	0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
	87	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
	88	0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
	89	0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
	90	0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
	91	};
	92
	93	/*-
	94	* The representation of field elements.
	95	* ------------------------------------
	96	*
	97	* We represent field elements with either four 128-bit values, eight 128-bit
	98	* values, or four 64-bit values. The field element represented is:
	99	* v[0]2^0 + v[1]2^64 + v[2]2^128 + v[3]2^192 (mod p)
	100	* or:
	101	* v[0]2^0 + v[1]2^64 + v[2]2^128 + ... + v[8]2^512 (mod p)
	102	*
	103	* 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
	104	* apart, but are 128-bits wide, the most significant bits of each limb overlap
	105	* with the least significant bits of the next.
	106	*
	107	* A field element with four limbs is an 'felem'. One with eight limbs is a
	108	* 'longfelem'
	109	*
	110	* A field element with four, 64-bit values is called a 'smallfelem'. Small
	111	* values are used as intermediate values before multiplication.
	112	*/
	113
	114	# define NLIMBS 4
	115
	116	typedef uint128_t limb;
	117	typedef limb felem[NLIMBS];
	118	typedef limb longfelem[NLIMBS * 2];
	119	typedef u64 smallfelem[NLIMBS];
	120
	121	/* This is the value of the prime as four 64-bit words, little-endian. */
	122	static const u64 kPrime[4] =
	123	{ 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
	124	static const u64 bottom63bits = 0x7ffffffffffffffful;
	125
	126	/*
	127	* bin32_to_felem takes a little-endian byte array and converts it into felem
	128	* form. This assumes that the CPU is little-endian.
	129	*/
	130	static void bin32_to_felem(felem out, const u8 in[32])
	131	{
	132	out[0] = ((u64 )&in[0]);
	133	out[1] = ((u64 )&in[8]);
	134	out[2] = ((u64 )&in[16]);
	135	out[3] = ((u64 )&in[24]);
	136	}
	137
	138	/*
	139	* smallfelem_to_bin32 takes a smallfelem and serialises into a little
	140	* endian, 32 byte array. This assumes that the CPU is little-endian.
	141	*/
	142	static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
	143	{
	144	((u64 )&out[0]) = in[0];
	145	((u64 )&out[8]) = in[1];
	146	((u64 )&out[16]) = in[2];
	147	((u64 )&out[24]) = in[3];
	148	}
	149
	150	/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
	151	static void flip_endian(u8 out, const u8 in, unsigned len)
	152	{
	153	unsigned i;
	154	for (i = 0; i < len; ++i)
	155	out[i] = in[len - 1 - i];
	156	}
	157
	158	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
	159	static int BN_to_felem(felem out, const BIGNUM *bn)
	160	{
	161	felem_bytearray b_in;
	162	felem_bytearray b_out;
	163	unsigned num_bytes;
	164
	165	/* BN_bn2bin eats leading zeroes */
	166	memset(b_out, 0, sizeof(b_out));
	167	num_bytes = BN_num_bytes(bn);
	168	if (num_bytes > sizeof b_out) {
	169	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
	170	return 0;
	171	}
	172	if (BN_is_negative(bn)) {
	173	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
	174	return 0;
	175	}
	176	num_bytes = BN_bn2bin(bn, b_in);
	177	flip_endian(b_out, b_in, num_bytes);
	178	bin32_to_felem(out, b_out);
	179	return 1;
	180	}
	181
	182	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
	183	static BIGNUM smallfelem_to_BN(BIGNUM out, const smallfelem in)
	184	{
	185	felem_bytearray b_in, b_out;
	186	smallfelem_to_bin32(b_in, in);
	187	flip_endian(b_out, b_in, sizeof b_out);
	188	return BN_bin2bn(b_out, sizeof b_out, out);
	189	}
	190
	191	/*-
	192	* Field operations
	193	* ----------------
	194	*/
	195
	196	static void smallfelem_one(smallfelem out)
	197	{
	198	out[0] = 1;
	199	out[1] = 0;
	200	out[2] = 0;
	201	out[3] = 0;
	202	}
	203
	204	static void smallfelem_assign(smallfelem out, const smallfelem in)
	205	{
	206	out[0] = in[0];
	207	out[1] = in[1];
	208	out[2] = in[2];
	209	out[3] = in[3];
	210	}
	211
	212	static void felem_assign(felem out, const felem in)
	213	{
	214	out[0] = in[0];
	215	out[1] = in[1];
	216	out[2] = in[2];
	217	out[3] = in[3];
	218	}
	219
	220	/* felem_sum sets out = out + in. */
	221	static void felem_sum(felem out, const felem in)
	222	{
	223	out[0] += in[0];
	224	out[1] += in[1];
	225	out[2] += in[2];
	226	out[3] += in[3];
	227	}
	228
	229	/* felem_small_sum sets out = out + in. */
	230	static void felem_small_sum(felem out, const smallfelem in)
	231	{
	232	out[0] += in[0];
	233	out[1] += in[1];
	234	out[2] += in[2];
	235	out[3] += in[3];
	236	}
	237
	238	/* felem_scalar sets out = out * scalar */
	239	static void felem_scalar(felem out, const u64 scalar)
	240	{
	241	out[0] *= scalar;
	242	out[1] *= scalar;
	243	out[2] *= scalar;
	244	out[3] *= scalar;
	245	}
	246
	247	/* longfelem_scalar sets out = out * scalar */
	248	static void longfelem_scalar(longfelem out, const u64 scalar)
	249	{
	250	out[0] *= scalar;
	251	out[1] *= scalar;
	252	out[2] *= scalar;
	253	out[3] *= scalar;
	254	out[4] *= scalar;
	255	out[5] *= scalar;
	256	out[6] *= scalar;
	257	out[7] *= scalar;
	258	}
	259
	260	# define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
	261	# define two105 (((limb)1) << 105)
	262	# define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
	263
	264	/* zero105 is 0 mod p */
	265	static const felem zero105 =
	266	{ two105m41m9, two105, two105m41p9, two105m41p9 };
	267
	268	/*-
	269	* smallfelem_neg sets \|out\| to \|-small\|
	270	* On exit:
	271	* out[i] < out[i] + 2^105
	272	*/
	273	static void smallfelem_neg(felem out, const smallfelem small)
	274	{
	275	/* In order to prevent underflow, we subtract from 0 mod p. */
	276	out[0] = zero105[0] - small[0];
	277	out[1] = zero105[1] - small[1];
	278	out[2] = zero105[2] - small[2];
	279	out[3] = zero105[3] - small[3];
	280	}
	281
	282	/*-
	283	* felem_diff subtracts \|in\| from \|out\|
	284	* On entry:
	285	* in[i] < 2^104
	286	* On exit:
	287	* out[i] < out[i] + 2^105
	288	*/
	289	static void felem_diff(felem out, const felem in)
	290	{
	291	/*
	292	* In order to prevent underflow, we add 0 mod p before subtracting.
	293	*/
	294	out[0] += zero105[0];
	295	out[1] += zero105[1];
	296	out[2] += zero105[2];
	297	out[3] += zero105[3];
	298
	299	out[0] -= in[0];
	300	out[1] -= in[1];
	301	out[2] -= in[2];
	302	out[3] -= in[3];
	303	}
	304
	305	# define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
	306	# define two107 (((limb)1) << 107)
	307	# define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
	308
	309	/* zero107 is 0 mod p */
	310	static const felem zero107 =
	311	{ two107m43m11, two107, two107m43p11, two107m43p11 };
	312
	313	/*-
	314	* An alternative felem_diff for larger inputs \|in\|
	315	* felem_diff_zero107 subtracts \|in\| from \|out\|
	316	* On entry:
	317	* in[i] < 2^106
	318	* On exit:
	319	* out[i] < out[i] + 2^107
	320	*/
	321	static void felem_diff_zero107(felem out, const felem in)
	322	{
	323	/*
	324	* In order to prevent underflow, we add 0 mod p before subtracting.
	325	*/
	326	out[0] += zero107[0];
	327	out[1] += zero107[1];
	328	out[2] += zero107[2];
	329	out[3] += zero107[3];
	330
	331	out[0] -= in[0];
	332	out[1] -= in[1];
	333	out[2] -= in[2];
	334	out[3] -= in[3];
	335	}
	336
	337	/*-
	338	* longfelem_diff subtracts \|in\| from \|out\|
	339	* On entry:
	340	* in[i] < 7*2^67
	341	* On exit:
	342	* out[i] < out[i] + 2^70 + 2^40
	343	*/
	344	static void longfelem_diff(longfelem out, const longfelem in)
	345	{
	346	static const limb two70m8p6 =
	347	(((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
	348	static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
	349	static const limb two70 = (((limb) 1) << 70);
	350	static const limb two70m40m38p6 =
	351	(((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
	352	(((limb) 1) << 6);
	353	static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
	354
	355	/* add 0 mod p to avoid underflow */
	356	out[0] += two70m8p6;
	357	out[1] += two70p40;
	358	out[2] += two70;
	359	out[3] += two70m40m38p6;
	360	out[4] += two70m6;
	361	out[5] += two70m6;
	362	out[6] += two70m6;
	363	out[7] += two70m6;
	364
	365	/* in[i] < 72^67 < 2^70 - 2^40 - 2^38 + 2^6 /
	366	out[0] -= in[0];
	367	out[1] -= in[1];
	368	out[2] -= in[2];
	369	out[3] -= in[3];
	370	out[4] -= in[4];
	371	out[5] -= in[5];
	372	out[6] -= in[6];
	373	out[7] -= in[7];
	374	}
	375
	376	# define two64m0 (((limb)1) << 64) - 1
	377	# define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
	378	# define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
	379	# define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
	380
	381	/* zero110 is 0 mod p */
	382	static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
	383
	384	/*-
	385	* felem_shrink converts an felem into a smallfelem. The result isn't quite
	386	* minimal as the value may be greater than p.
	387	*
	388	* On entry:
	389	* in[i] < 2^109
	390	* On exit:
	391	* out[i] < 2^64
	392	*/
	393	static void felem_shrink(smallfelem out, const felem in)
	394	{
	395	felem tmp;
	396	u64 a, b, mask;
	397	s64 high, low;
	398	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
	399
	400	/* Carry 2->3 */
	401	tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
	402	/* tmp[3] < 2^110 */
	403
	404	tmp[2] = zero110[2] + (u64)in[2];
	405	tmp[0] = zero110[0] + in[0];
	406	tmp[1] = zero110[1] + in[1];
	407	/* tmp[0] < 2110, tmp[1] < 2^111, tmp[2] < 265 */
	408
	409	/*
	410	* We perform two partial reductions where we eliminate the high-word of
	411	* tmp[3]. We don't update the other words till the end.
	412	*/
	413	a = tmp[3] >> 64; /* a < 2^46 */
	414	tmp[3] = (u64)tmp[3];
	415	tmp[3] -= a;
	416	tmp[3] += ((limb) a) << 32;
	417	/* tmp[3] < 2^79 */
	418
	419	b = a;
	420	a = tmp[3] >> 64; /* a < 2^15 */
	421	b += a; /* b < 2^46 + 2^15 < 2^47 */
	422	tmp[3] = (u64)tmp[3];
	423	tmp[3] -= a;
	424	tmp[3] += ((limb) a) << 32;
	425	/* tmp[3] < 2^64 + 2^47 */
	426
	427	/*
	428	* This adjusts the other two words to complete the two partial
	429	* reductions.
	430	*/
	431	tmp[0] += b;
	432	tmp[1] -= (((limb) b) << 32);
	433
	434	/*
	435	* In order to make space in tmp[3] for the carry from 2 -> 3, we
	436	* conditionally subtract kPrime if tmp[3] is large enough.
	437	*/
	438	high = tmp[3] >> 64;
	439	/* As tmp[3] < 2^65, high is either 1 or 0 */
	440	high <<= 63;
	441	high >>= 63;
	442	/*-
	443	* high is:
	444	* all ones if the high word of tmp[3] is 1
	445	* all zeros if the high word of tmp[3] if 0 */
	446	low = tmp[3];
	447	mask = low >> 63;
	448	/*-
	449	* mask is:
	450	* all ones if the MSB of low is 1
	451	* all zeros if the MSB of low if 0 */
	452	low &= bottom63bits;
	453	low -= kPrime3Test;
	454	/* if low was greater than kPrime3Test then the MSB is zero */
	455	low = ~low;
	456	low >>= 63;
	457	/*-
	458	* low is:
	459	* all ones if low was > kPrime3Test
	460	* all zeros if low was <= kPrime3Test */
	461	mask = (mask & low) \| high;
	462	tmp[0] -= mask & kPrime[0];
	463	tmp[1] -= mask & kPrime[1];
	464	/* kPrime[2] is zero, so omitted */
	465	tmp[3] -= mask & kPrime[3];
	466	/* tmp[3] < 264 - 232 + 1 */
	467
	468	tmp[1] += ((u64)(tmp[0] >> 64));
	469	tmp[0] = (u64)tmp[0];
	470	tmp[2] += ((u64)(tmp[1] >> 64));
	471	tmp[1] = (u64)tmp[1];
	472	tmp[3] += ((u64)(tmp[2] >> 64));
	473	tmp[2] = (u64)tmp[2];
	474	/* tmp[i] < 2^64 */
	475
	476	out[0] = tmp[0];
	477	out[1] = tmp[1];
	478	out[2] = tmp[2];
	479	out[3] = tmp[3];
	480	}
	481
	482	/* smallfelem_expand converts a smallfelem to an felem */
	483	static void smallfelem_expand(felem out, const smallfelem in)
	484	{
	485	out[0] = in[0];
	486	out[1] = in[1];
	487	out[2] = in[2];
	488	out[3] = in[3];
	489	}
	490
	491	/*-
	492	* smallfelem_square sets \|out\| = \|small\|^2
	493	* On entry:
	494	* small[i] < 2^64
	495	* On exit:
	496	* out[i] < 7 * 2^64 < 2^67
	497	*/
	498	static void smallfelem_square(longfelem out, const smallfelem small)
	499	{
	500	limb a;
	501	u64 high, low;
	502
	503	a = ((uint128_t) small[0]) * small[0];
	504	low = a;
	505	high = a >> 64;
	506	out[0] = low;
	507	out[1] = high;
	508
	509	a = ((uint128_t) small[0]) * small[1];
	510	low = a;
	511	high = a >> 64;
	512	out[1] += low;
	513	out[1] += low;
	514	out[2] = high;
	515
	516	a = ((uint128_t) small[0]) * small[2];
	517	low = a;
	518	high = a >> 64;
	519	out[2] += low;
	520	out[2] *= 2;
	521	out[3] = high;
	522
	523	a = ((uint128_t) small[0]) * small[3];
	524	low = a;
	525	high = a >> 64;
	526	out[3] += low;
	527	out[4] = high;
	528
	529	a = ((uint128_t) small[1]) * small[2];
	530	low = a;
	531	high = a >> 64;
	532	out[3] += low;
	533	out[3] *= 2;
	534	out[4] += high;
	535
	536	a = ((uint128_t) small[1]) * small[1];
	537	low = a;
	538	high = a >> 64;
	539	out[2] += low;
	540	out[3] += high;
	541
	542	a = ((uint128_t) small[1]) * small[3];
	543	low = a;
	544	high = a >> 64;
	545	out[4] += low;
	546	out[4] *= 2;
	547	out[5] = high;
	548
	549	a = ((uint128_t) small[2]) * small[3];
	550	low = a;
	551	high = a >> 64;
	552	out[5] += low;
	553	out[5] *= 2;
	554	out[6] = high;
	555	out[6] += high;
	556
	557	a = ((uint128_t) small[2]) * small[2];
	558	low = a;
	559	high = a >> 64;
	560	out[4] += low;
	561	out[5] += high;
	562
	563	a = ((uint128_t) small[3]) * small[3];
	564	low = a;
	565	high = a >> 64;
	566	out[6] += low;
	567	out[7] = high;
	568	}
	569
	570	/*-
	571	* felem_square sets \|out\| = \|in\|^2
	572	* On entry:
	573	* in[i] < 2^109
	574	* On exit:
	575	* out[i] < 7 * 2^64 < 2^67
	576	*/
	577	static void felem_square(longfelem out, const felem in)
	578	{
	579	u64 small[4];
	580	felem_shrink(small, in);
	581	smallfelem_square(out, small);
	582	}
	583
	584	/*-
	585	* smallfelem_mul sets \|out\| = \|small1\| * \|small2\|
	586	* On entry:
	587	* small1[i] < 2^64
	588	* small2[i] < 2^64
	589	* On exit:
	590	* out[i] < 7 * 2^64 < 2^67
	591	*/
	592	static void smallfelem_mul(longfelem out, const smallfelem small1,
	593	const smallfelem small2)
	594	{
	595	limb a;
	596	u64 high, low;
	597
	598	a = ((uint128_t) small1[0]) * small2[0];
	599	low = a;
	600	high = a >> 64;
	601	out[0] = low;
	602	out[1] = high;
	603
	604	a = ((uint128_t) small1[0]) * small2[1];
	605	low = a;
	606	high = a >> 64;
	607	out[1] += low;
	608	out[2] = high;
	609
	610	a = ((uint128_t) small1[1]) * small2[0];
	611	low = a;
	612	high = a >> 64;
	613	out[1] += low;
	614	out[2] += high;
	615
	616	a = ((uint128_t) small1[0]) * small2[2];
	617	low = a;
	618	high = a >> 64;
	619	out[2] += low;
	620	out[3] = high;
	621
	622	a = ((uint128_t) small1[1]) * small2[1];
	623	low = a;
	624	high = a >> 64;
	625	out[2] += low;
	626	out[3] += high;
	627
	628	a = ((uint128_t) small1[2]) * small2[0];
	629	low = a;
	630	high = a >> 64;
	631	out[2] += low;
	632	out[3] += high;
	633
	634	a = ((uint128_t) small1[0]) * small2[3];
	635	low = a;
	636	high = a >> 64;
	637	out[3] += low;
	638	out[4] = high;
	639
	640	a = ((uint128_t) small1[1]) * small2[2];
	641	low = a;
	642	high = a >> 64;
	643	out[3] += low;
	644	out[4] += high;
	645
	646	a = ((uint128_t) small1[2]) * small2[1];
	647	low = a;
	648	high = a >> 64;
	649	out[3] += low;
	650	out[4] += high;
	651
	652	a = ((uint128_t) small1[3]) * small2[0];
	653	low = a;
	654	high = a >> 64;
	655	out[3] += low;
	656	out[4] += high;
	657
	658	a = ((uint128_t) small1[1]) * small2[3];
	659	low = a;
	660	high = a >> 64;
	661	out[4] += low;
	662	out[5] = high;
	663
	664	a = ((uint128_t) small1[2]) * small2[2];
	665	low = a;
	666	high = a >> 64;
	667	out[4] += low;
	668	out[5] += high;
	669
	670	a = ((uint128_t) small1[3]) * small2[1];
	671	low = a;
	672	high = a >> 64;
	673	out[4] += low;
	674	out[5] += high;
	675
	676	a = ((uint128_t) small1[2]) * small2[3];
	677	low = a;
	678	high = a >> 64;
	679	out[5] += low;
	680	out[6] = high;
	681
	682	a = ((uint128_t) small1[3]) * small2[2];
	683	low = a;
	684	high = a >> 64;
	685	out[5] += low;
	686	out[6] += high;
	687
	688	a = ((uint128_t) small1[3]) * small2[3];
	689	low = a;
	690	high = a >> 64;
	691	out[6] += low;
	692	out[7] = high;
	693	}
	694
	695	/*-
	696	* felem_mul sets \|out\| = \|in1\| * \|in2\|
	697	* On entry:
	698	* in1[i] < 2^109
	699	* in2[i] < 2^109
	700	* On exit:
	701	* out[i] < 7 * 2^64 < 2^67
	702	*/
	703	static void felem_mul(longfelem out, const felem in1, const felem in2)
	704	{
	705	smallfelem small1, small2;
	706	felem_shrink(small1, in1);
	707	felem_shrink(small2, in2);
	708	smallfelem_mul(out, small1, small2);
	709	}
	710
	711	/*-
	712	* felem_small_mul sets \|out\| = \|small1\| * \|in2\|
	713	* On entry:
	714	* small1[i] < 2^64
	715	* in2[i] < 2^109
	716	* On exit:
	717	* out[i] < 7 * 2^64 < 2^67
	718	*/
	719	static void felem_small_mul(longfelem out, const smallfelem small1,
	720	const felem in2)
	721	{
	722	smallfelem small2;
	723	felem_shrink(small2, in2);
	724	smallfelem_mul(out, small1, small2);
	725	}
	726
	727	# define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
	728	# define two100 (((limb)1) << 100)
	729	# define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
	730	/* zero100 is 0 mod p */
	731	static const felem zero100 =
	732	{ two100m36m4, two100, two100m36p4, two100m36p4 };
	733
	734	/*-
	735	* Internal function for the different flavours of felem_reduce.
	736	* felem_reduce_ reduces the higher coefficients in[4]-in[7].
	737	* On entry:
	738	* out[0] >= in[6] + 2^32in[6] + in[7] + 2^32in[7]
	739	* out[1] >= in[7] + 2^32*in[4]
	740	* out[2] >= in[5] + 2^32*in[5]
	741	* out[3] >= in[4] + 2^32in[5] + 2^32in[6]
	742	* On exit:
	743	* out[0] <= out[0] + in[4] + 2^32*in[5]
	744	* out[1] <= out[1] + in[5] + 2^33*in[6]
	745	* out[2] <= out[2] + in[7] + 2in[6] + 2^33in[7]
	746	* out[3] <= out[3] + 2^32in[4] + 3in[7]
	747	*/
	748	static void felem_reduce_(felem out, const longfelem in)
	749	{
	750	int128_t c;
	751	/* combine common terms from below */
	752	c = in[4] + (in[5] << 32);
	753	out[0] += c;
	754	out[3] -= c;
	755
	756	c = in[5] - in[7];
	757	out[1] += c;
	758	out[2] -= c;
	759
	760	/* the remaining terms */
	761	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
	762	out[1] -= (in[4] << 32);
	763	out[3] += (in[4] << 32);
	764
	765	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
	766	out[2] -= (in[5] << 32);
	767
	768	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
	769	out[0] -= in[6];
	770	out[0] -= (in[6] << 32);
	771	out[1] += (in[6] << 33);
	772	out[2] += (in[6] * 2);
	773	out[3] -= (in[6] << 32);
	774
	775	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
	776	out[0] -= in[7];
	777	out[0] -= (in[7] << 32);
	778	out[2] += (in[7] << 33);
	779	out[3] += (in[7] * 3);
	780	}
	781
	782	/*-
	783	* felem_reduce converts a longfelem into an felem.
	784	* To be called directly after felem_square or felem_mul.
	785	* On entry:
	786	* in[0] < 2^64, in[1] < 32^64, in[2] < 52^64, in[3] < 7*2^64
	787	* in[4] < 72^64, in[5] < 52^64, in[6] < 32^64, in[7] < 264
	788	* On exit:
	789	* out[i] < 2^101
	790	*/
	791	static void felem_reduce(felem out, const longfelem in)
	792	{
	793	out[0] = zero100[0] + in[0];
	794	out[1] = zero100[1] + in[1];
	795	out[2] = zero100[2] + in[2];
	796	out[3] = zero100[3] + in[3];
	797
	798	felem_reduce_(out, in);
	799
	800	/*-
	801	* out[0] > 2^100 - 2^36 - 2^4 - 32^64 - 32^96 - 2^64 - 2^96 > 0
	802	* out[1] > 2^100 - 2^64 - 7*2^96 > 0
	803	* out[2] > 2^100 - 2^36 + 2^4 - 52^64 - 52^96 > 0
	804	* out[3] > 2^100 - 2^36 + 2^4 - 72^64 - 52^96 - 3*2^96 > 0
	805	*
	806	* out[0] < 2^100 + 2^64 + 72^64 + 52^96 < 2^101
	807	* out[1] < 2^100 + 32^64 + 52^64 + 3*2^97 < 2^101
	808	* out[2] < 2^100 + 52^64 + 2^64 + 32^65 + 2^97 < 2^101
	809	* out[3] < 2^100 + 72^64 + 72^96 + 3*2^64 < 2^101
	810	*/
	811	}
	812
	813	/*-
	814	* felem_reduce_zero105 converts a larger longfelem into an felem.
	815	* On entry:
	816	* in[0] < 2^71
	817	* On exit:
	818	* out[i] < 2^106
	819	*/
	820	static void felem_reduce_zero105(felem out, const longfelem in)
	821	{
	822	out[0] = zero105[0] + in[0];
	823	out[1] = zero105[1] + in[1];
	824	out[2] = zero105[2] + in[2];
	825	out[3] = zero105[3] + in[3];
	826
	827	felem_reduce_(out, in);
	828
	829	/*-
	830	* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
	831	* out[1] > 2^105 - 2^71 - 2^103 > 0
	832	* out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
	833	* out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
	834	*
	835	* out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	836	* out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
	837	* out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
	838	* out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
	839	*/
	840	}
	841
	842	/*
	843	* subtract_u64 sets result = result - v and *carry to one if the
	844	* subtraction underflowed.
	845	*/
	846	static void subtract_u64(u64 result, u64 carry, u64 v)
	847	{
	848	uint128_t r = *result;
	849	r -= v;
	850	*carry = (r >> 64) & 1;
	851	*result = (u64)r;
	852	}
	853
	854	/*
	855	* felem_contract converts \|in\| to its unique, minimal representation. On
	856	* entry: in[i] < 2^109
	857	*/
	858	static void felem_contract(smallfelem out, const felem in)
	859	{
	860	unsigned i;
	861	u64 all_equal_so_far = 0, result = 0, carry;
	862
	863	felem_shrink(out, in);
	864	/* small is minimal except that the value might be > p */
	865
	866	all_equal_so_far--;
	867	/*
	868	* We are doing a constant time test if out >= kPrime. We need to compare
	869	* each u64, from most-significant to least significant. For each one, if
	870	* all words so far have been equal (m is all ones) then a non-equal
	871	* result is the answer. Otherwise we continue.
	872	*/
	873	for (i = 3; i < 4; i--) {
	874	u64 equal;
	875	uint128_t a = ((uint128_t) kPrime[i]) - out[i];
	876	/*
	877	* if out[i] > kPrime[i] then a will underflow and the high 64-bits
	878	* will all be set.
	879	*/
	880	result \|= all_equal_so_far & ((u64)(a >> 64));
	881
	882	/*
	883	* if kPrime[i] == out[i] then \|equal\| will be all zeros and the
	884	* decrement will make it all ones.
	885	*/
	886	equal = kPrime[i] ^ out[i];
	887	equal--;
	888	equal &= equal << 32;
	889	equal &= equal << 16;
	890	equal &= equal << 8;
	891	equal &= equal << 4;
	892	equal &= equal << 2;
	893	equal &= equal << 1;
	894	equal = ((s64) equal) >> 63;
	895
	896	all_equal_so_far &= equal;
	897	}
	898
	899	/*
	900	* if all_equal_so_far is still all ones then the two values are equal
	901	* and so out >= kPrime is true.
	902	*/
	903	result \|= all_equal_so_far;
	904
	905	/* if out >= kPrime then we subtract kPrime. */
	906	subtract_u64(&out[0], &carry, result & kPrime[0]);
	907	subtract_u64(&out[1], &carry, carry);
	908	subtract_u64(&out[2], &carry, carry);
	909	subtract_u64(&out[3], &carry, carry);
	910
	911	subtract_u64(&out[1], &carry, result & kPrime[1]);
	912	subtract_u64(&out[2], &carry, carry);
	913	subtract_u64(&out[3], &carry, carry);
	914
	915	subtract_u64(&out[2], &carry, result & kPrime[2]);
	916	subtract_u64(&out[3], &carry, carry);
	917
	918	subtract_u64(&out[3], &carry, result & kPrime[3]);
	919	}
	920
	921	static void smallfelem_square_contract(smallfelem out, const smallfelem in)
	922	{
	923	longfelem longtmp;
	924	felem tmp;
	925
	926	smallfelem_square(longtmp, in);
	927	felem_reduce(tmp, longtmp);
	928	felem_contract(out, tmp);
	929	}
	930
	931	static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
	932	const smallfelem in2)
	933	{
	934	longfelem longtmp;
	935	felem tmp;
	936
	937	smallfelem_mul(longtmp, in1, in2);
	938	felem_reduce(tmp, longtmp);
	939	felem_contract(out, tmp);
	940	}
	941
	942	/*-
	943	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
	944	* otherwise.
	945	* On entry:
	946	* small[i] < 2^64
	947	*/
	948	static limb smallfelem_is_zero(const smallfelem small)
	949	{
	950	limb result;
	951	u64 is_p;
	952
	953	u64 is_zero = small[0] \| small[1] \| small[2] \| small[3];
	954	is_zero--;
	955	is_zero &= is_zero << 32;
	956	is_zero &= is_zero << 16;
	957	is_zero &= is_zero << 8;
	958	is_zero &= is_zero << 4;
	959	is_zero &= is_zero << 2;
	960	is_zero &= is_zero << 1;
	961	is_zero = ((s64) is_zero) >> 63;
	962
	963	is_p = (small[0] ^ kPrime[0]) \|
	964	(small[1] ^ kPrime[1]) \|
	965	(small[2] ^ kPrime[2]) \| (small[3] ^ kPrime[3]);
	966	is_p--;
	967	is_p &= is_p << 32;
	968	is_p &= is_p << 16;
	969	is_p &= is_p << 8;
	970	is_p &= is_p << 4;
	971	is_p &= is_p << 2;
	972	is_p &= is_p << 1;
	973	is_p = ((s64) is_p) >> 63;
	974
	975	is_zero \|= is_p;
	976
	977	result = is_zero;
	978	result \|= ((limb) is_zero) << 64;
	979	return result;
	980	}
	981
	982	static int smallfelem_is_zero_int(const smallfelem small)
	983	{
	984	return (int)(smallfelem_is_zero(small) & ((limb) 1));
	985	}
	986
	987	/*-
	988	* felem_inv calculates \|out\| = \|in\|^{-1}
	989	*
	990	* Based on Fermat's Little Theorem:
	991	* a^p = a (mod p)
	992	* a^{p-1} = 1 (mod p)
	993	* a^{p-2} = a^{-1} (mod p)
	994	*/
	995	static void felem_inv(felem out, const felem in)
	996	{
	997	felem ftmp, ftmp2;
	998	/* each e_I will hold \|in\|^{2^I - 1} */
	999	felem e2, e4, e8, e16, e32, e64;
	1000	longfelem tmp;
	1001	unsigned i;
	1002
	1003	felem_square(tmp, in);
	1004	felem_reduce(ftmp, tmp); /* 2^1 */
	1005	felem_mul(tmp, in, ftmp);
	1006	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
	1007	felem_assign(e2, ftmp);
	1008	felem_square(tmp, ftmp);
	1009	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
	1010	felem_square(tmp, ftmp);
	1011	felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */
	1012	felem_mul(tmp, ftmp, e2);
	1013	felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */
	1014	felem_assign(e4, ftmp);
	1015	felem_square(tmp, ftmp);
	1016	felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */
	1017	felem_square(tmp, ftmp);
	1018	felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */
	1019	felem_square(tmp, ftmp);
	1020	felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */
	1021	felem_square(tmp, ftmp);
	1022	felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */
	1023	felem_mul(tmp, ftmp, e4);
	1024	felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */
	1025	felem_assign(e8, ftmp);
	1026	for (i = 0; i < 8; i++) {
	1027	felem_square(tmp, ftmp);
	1028	felem_reduce(ftmp, tmp);
	1029	} /* 2^16 - 2^8 */
	1030	felem_mul(tmp, ftmp, e8);
	1031	felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */
	1032	felem_assign(e16, ftmp);
	1033	for (i = 0; i < 16; i++) {
	1034	felem_square(tmp, ftmp);
	1035	felem_reduce(ftmp, tmp);
	1036	} /* 2^32 - 2^16 */
	1037	felem_mul(tmp, ftmp, e16);
	1038	felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */
	1039	felem_assign(e32, ftmp);
	1040	for (i = 0; i < 32; i++) {
	1041	felem_square(tmp, ftmp);
	1042	felem_reduce(ftmp, tmp);
	1043	} /* 2^64 - 2^32 */
	1044	felem_assign(e64, ftmp);
	1045	felem_mul(tmp, ftmp, in);
	1046	felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */
	1047	for (i = 0; i < 192; i++) {
	1048	felem_square(tmp, ftmp);
	1049	felem_reduce(ftmp, tmp);
	1050	} /* 2^256 - 2^224 + 2^192 */
	1051
	1052	felem_mul(tmp, e64, e32);
	1053	felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
	1054	for (i = 0; i < 16; i++) {
	1055	felem_square(tmp, ftmp2);
	1056	felem_reduce(ftmp2, tmp);
	1057	} /* 2^80 - 2^16 */
	1058	felem_mul(tmp, ftmp2, e16);
	1059	felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
	1060	for (i = 0; i < 8; i++) {
	1061	felem_square(tmp, ftmp2);
	1062	felem_reduce(ftmp2, tmp);
	1063	} /* 2^88 - 2^8 */
	1064	felem_mul(tmp, ftmp2, e8);
	1065	felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
	1066	for (i = 0; i < 4; i++) {
	1067	felem_square(tmp, ftmp2);
	1068	felem_reduce(ftmp2, tmp);
	1069	} /* 2^92 - 2^4 */
	1070	felem_mul(tmp, ftmp2, e4);
	1071	felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
	1072	felem_square(tmp, ftmp2);
	1073	felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
	1074	felem_square(tmp, ftmp2);
	1075	felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
	1076	felem_mul(tmp, ftmp2, e2);
	1077	felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
	1078	felem_square(tmp, ftmp2);
	1079	felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
	1080	felem_square(tmp, ftmp2);
	1081	felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
	1082	felem_mul(tmp, ftmp2, in);
	1083	felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
	1084
	1085	felem_mul(tmp, ftmp2, ftmp);
	1086	felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
	1087	}
	1088
	1089	static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
	1090	{
	1091	felem tmp;
	1092
	1093	smallfelem_expand(tmp, in);
	1094	felem_inv(tmp, tmp);
	1095	felem_contract(out, tmp);
	1096	}
	1097
	1098	/*-
	1099	* Group operations
	1100	* ----------------
	1101	*
	1102	* Building on top of the field operations we have the operations on the
	1103	* elliptic curve group itself. Points on the curve are represented in Jacobian
	1104	* coordinates
	1105	*/
	1106
	1107	/*-
	1108	* point_double calculates 2*(x_in, y_in, z_in)
	1109	*
	1110	* The method is taken from:
	1111	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
	1112	*
	1113	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
	1114	* while x_out == y_in is not (maybe this works, but it's not tested).
	1115	*/
	1116	static void
	1117	point_double(felem x_out, felem y_out, felem z_out,
	1118	const felem x_in, const felem y_in, const felem z_in)
	1119	{
	1120	longfelem tmp, tmp2;
	1121	felem delta, gamma, beta, alpha, ftmp, ftmp2;
	1122	smallfelem small1, small2;
	1123
	1124	felem_assign(ftmp, x_in);
	1125	/* ftmp[i] < 2^106 */
	1126	felem_assign(ftmp2, x_in);
	1127	/* ftmp2[i] < 2^106 */
	1128
	1129	/* delta = z^2 */
	1130	felem_square(tmp, z_in);
	1131	felem_reduce(delta, tmp);
	1132	/* delta[i] < 2^101 */
	1133
	1134	/* gamma = y^2 */
	1135	felem_square(tmp, y_in);
	1136	felem_reduce(gamma, tmp);
	1137	/* gamma[i] < 2^101 */
	1138	felem_shrink(small1, gamma);
	1139
	1140	/* beta = xgamma /
	1141	felem_small_mul(tmp, small1, x_in);
	1142	felem_reduce(beta, tmp);
	1143	/* beta[i] < 2^101 */
	1144
	1145	/* alpha = 3(x-delta)(x+delta) */
	1146	felem_diff(ftmp, delta);
	1147	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
	1148	felem_sum(ftmp2, delta);
	1149	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
	1150	felem_scalar(ftmp2, 3);
	1151	/* ftmp2[i] < 3 * 2^107 < 2^109 */
	1152	felem_mul(tmp, ftmp, ftmp2);
	1153	felem_reduce(alpha, tmp);
	1154	/* alpha[i] < 2^101 */
	1155	felem_shrink(small2, alpha);
	1156
	1157	/* x' = alpha^2 - 8beta /
	1158	smallfelem_square(tmp, small2);
	1159	felem_reduce(x_out, tmp);
	1160	felem_assign(ftmp, beta);
	1161	felem_scalar(ftmp, 8);
	1162	/* ftmp[i] < 8 * 2^101 = 2^104 */
	1163	felem_diff(x_out, ftmp);
	1164	/* x_out[i] < 2^105 + 2^101 < 2^106 */
	1165
	1166	/* z' = (y + z)^2 - gamma - delta */
	1167	felem_sum(delta, gamma);
	1168	/* delta[i] < 2^101 + 2^101 = 2^102 */
	1169	felem_assign(ftmp, y_in);
	1170	felem_sum(ftmp, z_in);
	1171	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
	1172	felem_square(tmp, ftmp);
	1173	felem_reduce(z_out, tmp);
	1174	felem_diff(z_out, delta);
	1175	/* z_out[i] < 2^105 + 2^101 < 2^106 */
	1176
	1177	/* y' = alpha(4beta - x') - 8gamma^2 /
	1178	felem_scalar(beta, 4);
	1179	/* beta[i] < 4 * 2^101 = 2^103 */
	1180	felem_diff_zero107(beta, x_out);
	1181	/* beta[i] < 2^107 + 2^103 < 2^108 */
	1182	felem_small_mul(tmp, small2, beta);
	1183	/* tmp[i] < 7 * 2^64 < 2^67 */
	1184	smallfelem_square(tmp2, small1);
	1185	/* tmp2[i] < 7 * 2^64 */
	1186	longfelem_scalar(tmp2, 8);
	1187	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
	1188	longfelem_diff(tmp, tmp2);
	1189	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	1190	felem_reduce_zero105(y_out, tmp);
	1191	/* y_out[i] < 2^106 */
	1192	}
	1193
	1194	/*
	1195	* point_double_small is the same as point_double, except that it operates on
	1196	* smallfelems
	1197	*/
	1198	static void
	1199	point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
	1200	const smallfelem x_in, const smallfelem y_in,
	1201	const smallfelem z_in)
	1202	{
	1203	felem felem_x_out, felem_y_out, felem_z_out;
	1204	felem felem_x_in, felem_y_in, felem_z_in;
	1205
	1206	smallfelem_expand(felem_x_in, x_in);
	1207	smallfelem_expand(felem_y_in, y_in);
	1208	smallfelem_expand(felem_z_in, z_in);
	1209	point_double(felem_x_out, felem_y_out, felem_z_out,
	1210	felem_x_in, felem_y_in, felem_z_in);
	1211	felem_shrink(x_out, felem_x_out);
	1212	felem_shrink(y_out, felem_y_out);
	1213	felem_shrink(z_out, felem_z_out);
	1214	}
	1215
	1216	/* copy_conditional copies in to out iff mask is all ones. */
	1217	static void copy_conditional(felem out, const felem in, limb mask)
	1218	{
	1219	unsigned i;
	1220	for (i = 0; i < NLIMBS; ++i) {
	1221	const limb tmp = mask & (in[i] ^ out[i]);
	1222	out[i] ^= tmp;
	1223	}
	1224	}
	1225
	1226	/* copy_small_conditional copies in to out iff mask is all ones. */
	1227	static void copy_small_conditional(felem out, const smallfelem in, limb mask)
	1228	{
	1229	unsigned i;
	1230	const u64 mask64 = mask;
	1231	for (i = 0; i < NLIMBS; ++i) {
	1232	out[i] = ((limb) (in[i] & mask64)) \| (out[i] & ~mask);
	1233	}
	1234	}
	1235
	1236	/*-
	1237	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
	1238	*
	1239	* The method is taken from:
	1240	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
	1241	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
	1242	*
	1243	* This function includes a branch for checking whether the two input points
	1244	* are equal, (while not equal to the point at infinity). This case never
	1245	* happens during single point multiplication, so there is no timing leak for
	1246	* ECDH or ECDSA signing.
	1247	*/
	1248	static void point_add(felem x3, felem y3, felem z3,
	1249	const felem x1, const felem y1, const felem z1,
	1250	const int mixed, const smallfelem x2,
	1251	const smallfelem y2, const smallfelem z2)
	1252	{
	1253	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	1254	longfelem tmp, tmp2;
	1255	smallfelem small1, small2, small3, small4, small5;
	1256	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
	1257
	1258	felem_shrink(small3, z1);
	1259
	1260	z1_is_zero = smallfelem_is_zero(small3);
	1261	z2_is_zero = smallfelem_is_zero(z2);
	1262
	1263	/* ftmp = z1z1 = z1*2 /
	1264	smallfelem_square(tmp, small3);
	1265	felem_reduce(ftmp, tmp);
	1266	/* ftmp[i] < 2^101 */
	1267	felem_shrink(small1, ftmp);
	1268
	1269	if (!mixed) {
	1270	/* ftmp2 = z2z2 = z2*2 /
	1271	smallfelem_square(tmp, z2);
	1272	felem_reduce(ftmp2, tmp);
	1273	/* ftmp2[i] < 2^101 */
	1274	felem_shrink(small2, ftmp2);
	1275
	1276	felem_shrink(small5, x1);
	1277
	1278	/* u1 = ftmp3 = x1z2z2 /
	1279	smallfelem_mul(tmp, small5, small2);
	1280	felem_reduce(ftmp3, tmp);
	1281	/* ftmp3[i] < 2^101 */
	1282
	1283	/* ftmp5 = z1 + z2 */
	1284	felem_assign(ftmp5, z1);
	1285	felem_small_sum(ftmp5, z2);
	1286	/* ftmp5[i] < 2^107 */
	1287
	1288	/* ftmp5 = (z1 + z2)*2 - (z1z1 + z2z2) = 2z1z2 /
	1289	felem_square(tmp, ftmp5);
	1290	felem_reduce(ftmp5, tmp);
	1291	/* ftmp2 = z2z2 + z1z1 */
	1292	felem_sum(ftmp2, ftmp);
	1293	/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
	1294	felem_diff(ftmp5, ftmp2);
	1295	/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
	1296
	1297	/* ftmp2 = z2 * z2z2 */
	1298	smallfelem_mul(tmp, small2, z2);
	1299	felem_reduce(ftmp2, tmp);
	1300
	1301	/* s1 = ftmp2 = y1 * z2*3 /
	1302	felem_mul(tmp, y1, ftmp2);
	1303	felem_reduce(ftmp6, tmp);
	1304	/* ftmp6[i] < 2^101 */
	1305	} else {
	1306	/*
	1307	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
	1308	*/
	1309
	1310	/* u1 = ftmp3 = x1z2z2 /
	1311	felem_assign(ftmp3, x1);
	1312	/* ftmp3[i] < 2^106 */
	1313
	1314	/* ftmp5 = 2z1z2 */
	1315	felem_assign(ftmp5, z1);
	1316	felem_scalar(ftmp5, 2);
	1317	/* ftmp5[i] < 22^106 = 2^107 /
	1318
	1319	/* s1 = ftmp2 = y1 * z2*3 /
	1320	felem_assign(ftmp6, y1);
	1321	/* ftmp6[i] < 2^106 */
	1322	}
	1323
	1324	/* u2 = x2z1z1 /
	1325	smallfelem_mul(tmp, x2, small1);
	1326	felem_reduce(ftmp4, tmp);
	1327
	1328	/* h = ftmp4 = u2 - u1 */
	1329	felem_diff_zero107(ftmp4, ftmp3);
	1330	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
	1331	felem_shrink(small4, ftmp4);
	1332
	1333	x_equal = smallfelem_is_zero(small4);
	1334
	1335	/* z_out = ftmp5 * h */
	1336	felem_small_mul(tmp, small4, ftmp5);
	1337	felem_reduce(z_out, tmp);
	1338	/* z_out[i] < 2^101 */
	1339
	1340	/* ftmp = z1 * z1z1 */
	1341	smallfelem_mul(tmp, small1, small3);
	1342	felem_reduce(ftmp, tmp);
	1343
	1344	/* s2 = tmp = y2 * z1*3 /
	1345	felem_small_mul(tmp, y2, ftmp);
	1346	felem_reduce(ftmp5, tmp);
	1347
	1348	/* r = ftmp5 = (s2 - s1)2 /
	1349	felem_diff_zero107(ftmp5, ftmp6);
	1350	/* ftmp5[i] < 2^107 + 2^107 = 2^108 */
	1351	felem_scalar(ftmp5, 2);
	1352	/* ftmp5[i] < 2^109 */
	1353	felem_shrink(small1, ftmp5);
	1354	y_equal = smallfelem_is_zero(small1);
	1355
	1356	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
	1357	point_double(x3, y3, z3, x1, y1, z1);
	1358	return;
	1359	}
	1360
	1361	/* I = ftmp = (2h)*2 /
	1362	felem_assign(ftmp, ftmp4);
	1363	felem_scalar(ftmp, 2);
	1364	/* ftmp[i] < 22^108 = 2^109 /
	1365	felem_square(tmp, ftmp);
	1366	felem_reduce(ftmp, tmp);
	1367
	1368	/* J = ftmp2 = h * I */
	1369	felem_mul(tmp, ftmp4, ftmp);
	1370	felem_reduce(ftmp2, tmp);
	1371
	1372	/* V = ftmp4 = U1 * I */
	1373	felem_mul(tmp, ftmp3, ftmp);
	1374	felem_reduce(ftmp4, tmp);
	1375
	1376	/* x_out = r*2 - J - 2V /
	1377	smallfelem_square(tmp, small1);
	1378	felem_reduce(x_out, tmp);
	1379	felem_assign(ftmp3, ftmp4);
	1380	felem_scalar(ftmp4, 2);
	1381	felem_sum(ftmp4, ftmp2);
	1382	/* ftmp4[i] < 22^101 + 2^101 < 2^103 /
	1383	felem_diff(x_out, ftmp4);
	1384	/* x_out[i] < 2^105 + 2^101 */
	1385
	1386	/* y_out = r(V-x_out) - 2 * s1 * J */
	1387	felem_diff_zero107(ftmp3, x_out);
	1388	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
	1389	felem_small_mul(tmp, small1, ftmp3);
	1390	felem_mul(tmp2, ftmp6, ftmp2);
	1391	longfelem_scalar(tmp2, 2);
	1392	/* tmp2[i] < 22^67 = 2^68 /
	1393	longfelem_diff(tmp, tmp2);
	1394	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
	1395	felem_reduce_zero105(y_out, tmp);
	1396	/* y_out[i] < 2^106 */
	1397
	1398	copy_small_conditional(x_out, x2, z1_is_zero);
	1399	copy_conditional(x_out, x1, z2_is_zero);
	1400	copy_small_conditional(y_out, y2, z1_is_zero);
	1401	copy_conditional(y_out, y1, z2_is_zero);
	1402	copy_small_conditional(z_out, z2, z1_is_zero);
	1403	copy_conditional(z_out, z1, z2_is_zero);
	1404	felem_assign(x3, x_out);
	1405	felem_assign(y3, y_out);
	1406	felem_assign(z3, z_out);
	1407	}
	1408
	1409	/*
	1410	* point_add_small is the same as point_add, except that it operates on
	1411	* smallfelems
	1412	*/
	1413	static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
	1414	smallfelem x1, smallfelem y1, smallfelem z1,
	1415	smallfelem x2, smallfelem y2, smallfelem z2)
	1416	{
	1417	felem felem_x3, felem_y3, felem_z3;
	1418	felem felem_x1, felem_y1, felem_z1;
	1419	smallfelem_expand(felem_x1, x1);
	1420	smallfelem_expand(felem_y1, y1);
	1421	smallfelem_expand(felem_z1, z1);
	1422	point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
	1423	x2, y2, z2);
	1424	felem_shrink(x3, felem_x3);
	1425	felem_shrink(y3, felem_y3);
	1426	felem_shrink(z3, felem_z3);
	1427	}
	1428
	1429	/*-
	1430	* Base point pre computation
	1431	* --------------------------
	1432	*
	1433	* Two different sorts of precomputed tables are used in the following code.
	1434	* Each contain various points on the curve, where each point is three field
	1435	* elements (x, y, z).
	1436	*
	1437	* For the base point table, z is usually 1 (0 for the point at infinity).
	1438	* This table has 2 * 16 elements, starting with the following:
	1439	* index \| bits \| point
	1440	* ------+---------+------------------------------
	1441	* 0 \| 0 0 0 0 \| 0G
	1442	* 1 \| 0 0 0 1 \| 1G
	1443	* 2 \| 0 0 1 0 \| 2^64G
	1444	* 3 \| 0 0 1 1 \| (2^64 + 1)G
	1445	* 4 \| 0 1 0 0 \| 2^128G
	1446	* 5 \| 0 1 0 1 \| (2^128 + 1)G
	1447	* 6 \| 0 1 1 0 \| (2^128 + 2^64)G
	1448	* 7 \| 0 1 1 1 \| (2^128 + 2^64 + 1)G
	1449	* 8 \| 1 0 0 0 \| 2^192G
	1450	* 9 \| 1 0 0 1 \| (2^192 + 1)G
	1451	* 10 \| 1 0 1 0 \| (2^192 + 2^64)G
	1452	* 11 \| 1 0 1 1 \| (2^192 + 2^64 + 1)G
	1453	* 12 \| 1 1 0 0 \| (2^192 + 2^128)G
	1454	* 13 \| 1 1 0 1 \| (2^192 + 2^128 + 1)G
	1455	* 14 \| 1 1 1 0 \| (2^192 + 2^128 + 2^64)G
	1456	* 15 \| 1 1 1 1 \| (2^192 + 2^128 + 2^64 + 1)G
	1457	* followed by a copy of this with each element multiplied by 2^32.
	1458	*
	1459	* The reason for this is so that we can clock bits into four different
	1460	* locations when doing simple scalar multiplies against the base point,
	1461	* and then another four locations using the second 16 elements.
	1462	*
	1463	* Tables for other points have table[i] = iG for i in 0 .. 16. */
	1464
	1465	/* gmul is the table of precomputed base points */
	1466	static const smallfelem gmul[2][16][3] = {
	1467	{{{0, 0, 0, 0},
	1468	{0, 0, 0, 0},
	1469	{0, 0, 0, 0}},
	1470	{{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
	1471	0x6b17d1f2e12c4247},
	1472	{0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
	1473	0x4fe342e2fe1a7f9b},
	1474	{1, 0, 0, 0}},
	1475	{{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
	1476	0x0fa822bc2811aaa5},
	1477	{0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
	1478	0xbff44ae8f5dba80d},
	1479	{1, 0, 0, 0}},
	1480	{{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
	1481	0x300a4bbc89d6726f},
	1482	{0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
	1483	0x72aac7e0d09b4644},
	1484	{1, 0, 0, 0}},
	1485	{{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
	1486	0x447d739beedb5e67},
	1487	{0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
	1488	0x2d4825ab834131ee},
	1489	{1, 0, 0, 0}},
	1490	{{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
	1491	0xef9519328a9c72ff},
	1492	{0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
	1493	0x611e9fc37dbb2c9b},
	1494	{1, 0, 0, 0}},
	1495	{{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
	1496	0x550663797b51f5d8},
	1497	{0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
	1498	0x157164848aecb851},
	1499	{1, 0, 0, 0}},
	1500	{{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
	1501	0xeb5d7745b21141ea},
	1502	{0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
	1503	0xeafd72ebdbecc17b},
	1504	{1, 0, 0, 0}},
	1505	{{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
	1506	0xa6d39677a7849276},
	1507	{0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
	1508	0x674f84749b0b8816},
	1509	{1, 0, 0, 0}},
	1510	{{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
	1511	0x4e769e7672c9ddad},
	1512	{0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
	1513	0x42b99082de830663},
	1514	{1, 0, 0, 0}},
	1515	{{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
	1516	0x78878ef61c6ce04d},
	1517	{0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
	1518	0xb6cb3f5d7b72c321},
	1519	{1, 0, 0, 0}},
	1520	{{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
	1521	0x0c88bc4d716b1287},
	1522	{0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
	1523	0xdd5ddea3f3901dc6},
	1524	{1, 0, 0, 0}},
	1525	{{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
	1526	0x68f344af6b317466},
	1527	{0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
	1528	0x31b9c405f8540a20},
	1529	{1, 0, 0, 0}},
	1530	{{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
	1531	0x4052bf4b6f461db9},
	1532	{0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
	1533	0xfecf4d5190b0fc61},
	1534	{1, 0, 0, 0}},
	1535	{{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
	1536	0x1eddbae2c802e41a},
	1537	{0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
	1538	0x43104d86560ebcfc},
	1539	{1, 0, 0, 0}},
	1540	{{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
	1541	0xb48e26b484f7a21c},
	1542	{0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
	1543	0xfac015404d4d3dab},
	1544	{1, 0, 0, 0}}},
	1545	{{{0, 0, 0, 0},
	1546	{0, 0, 0, 0},
	1547	{0, 0, 0, 0}},
	1548	{{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
	1549	0x7fe36b40af22af89},
	1550	{0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
	1551	0xe697d45825b63624},
	1552	{1, 0, 0, 0}},
	1553	{{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
	1554	0x4a5b506612a677a6},
	1555	{0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
	1556	0xeb13461ceac089f1},
	1557	{1, 0, 0, 0}},
	1558	{{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
	1559	0x0781b8291c6a220a},
	1560	{0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
	1561	0x690cde8df0151593},
	1562	{1, 0, 0, 0}},
	1563	{{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
	1564	0x8a535f566ec73617},
	1565	{0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
	1566	0x0455c08468b08bd7},
	1567	{1, 0, 0, 0}},
	1568	{{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
	1569	0x06bada7ab77f8276},
	1570	{0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
	1571	0x5b476dfd0e6cb18a},
	1572	{1, 0, 0, 0}},
	1573	{{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
	1574	0x3e29864e8a2ec908},
	1575	{0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
	1576	0x239b90ea3dc31e7e},
	1577	{1, 0, 0, 0}},
	1578	{{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
	1579	0x820f4dd949f72ff7},
	1580	{0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
	1581	0x140406ec783a05ec},
	1582	{1, 0, 0, 0}},
	1583	{{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
	1584	0x68f6b8542783dfee},
	1585	{0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
	1586	0xcbe1feba92e40ce6},
	1587	{1, 0, 0, 0}},
	1588	{{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
	1589	0xd0b2f94d2f420109},
	1590	{0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
	1591	0x971459828b0719e5},
	1592	{1, 0, 0, 0}},
	1593	{{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
	1594	0x961610004a866aba},
	1595	{0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
	1596	0x7acb9fadcee75e44},
	1597	{1, 0, 0, 0}},
	1598	{{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
	1599	0x24eb9acca333bf5b},
	1600	{0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
	1601	0x69f891c5acd079cc},
	1602	{1, 0, 0, 0}},
	1603	{{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
	1604	0xe51f547c5972a107},
	1605	{0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
	1606	0x1c309a2b25bb1387},
	1607	{1, 0, 0, 0}},
	1608	{{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
	1609	0x20b87b8aa2c4e503},
	1610	{0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
	1611	0xf5c6fa49919776be},
	1612	{1, 0, 0, 0}},
	1613	{{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
	1614	0x1ed7d1b9332010b9},
	1615	{0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
	1616	0x3a2b03f03217257a},
	1617	{1, 0, 0, 0}},
	1618	{{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
	1619	0x15fee545c78dd9f6},
	1620	{0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
	1621	0x4ab5b6b2b8753f81},
	1622	{1, 0, 0, 0}}}
	1623	};
	1624
	1625	/*
	1626	* select_point selects the \|idx\|th point from a precomputation table and
	1627	* copies it to out.
	1628	*/
	1629	static void select_point(const u64 idx, unsigned int size,
	1630	const smallfelem pre_comp[16][3], smallfelem out[3])
	1631	{
	1632	unsigned i, j;
	1633	u64 *outlimbs = &out[0][0];
	1634
	1635	memset(out, 0, sizeof(out) 3);
	1636
	1637	for (i = 0; i < size; i++) {
	1638	const u64 inlimbs = (u64 )&pre_comp[i][0][0];
	1639	u64 mask = i ^ idx;
	1640	mask \|= mask >> 4;
	1641	mask \|= mask >> 2;
	1642	mask \|= mask >> 1;
	1643	mask &= 1;
	1644	mask--;
	1645	for (j = 0; j < NLIMBS * 3; j++)
	1646	outlimbs[j] \|= inlimbs[j] & mask;
	1647	}
	1648	}
	1649
	1650	/* get_bit returns the \|i\|th bit in \|in\| */
	1651	static char get_bit(const felem_bytearray in, int i)
	1652	{
	1653	if ((i < 0) \|\| (i >= 256))
	1654	return 0;
	1655	return (in[i >> 3] >> (i & 7)) & 1;
	1656	}
	1657
	1658	/*
	1659	* Interleaved point multiplication using precomputed point multiples: The
	1660	* small point multiples 0P, 1P, ..., 17*P are in pre_comp[], the scalars
	1661	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
	1662	* generator, using certain (large) precomputed multiples in g_pre_comp.
	1663	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
	1664	*/
	1665	static void batch_mul(felem x_out, felem y_out, felem z_out,
	1666	const felem_bytearray scalars[],
	1667	const unsigned num_points, const u8 *g_scalar,
	1668	const int mixed, const smallfelem pre_comp[][17][3],
	1669	const smallfelem g_pre_comp[2][16][3])
	1670	{
	1671	int i, skip;
	1672	unsigned num, gen_mul = (g_scalar != NULL);
	1673	felem nq[3], ftmp;
	1674	smallfelem tmp[3];
	1675	u64 bits;
	1676	u8 sign, digit;
	1677
	1678	/* set nq to the point at infinity */
	1679	memset(nq, 0, sizeof(nq));
	1680
	1681	/*
	1682	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
	1683	* of the generator (two in each of the last 32 rounds) and additions of
	1684	* other points multiples (every 5th round).
	1685	*/
	1686	skip = 1; /* save two point operations in the first
	1687	* round */
	1688	for (i = (num_points ? 255 : 31); i >= 0; --i) {
	1689	/* double */
	1690	if (!skip)
	1691	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
	1692
	1693	/* add multiples of the generator */
	1694	if (gen_mul && (i <= 31)) {
	1695	/* first, look 32 bits upwards */
	1696	bits = get_bit(g_scalar, i + 224) << 3;
	1697	bits \|= get_bit(g_scalar, i + 160) << 2;
	1698	bits \|= get_bit(g_scalar, i + 96) << 1;
	1699	bits \|= get_bit(g_scalar, i + 32);
	1700	/* select the point to add, in constant time */
	1701	select_point(bits, 16, g_pre_comp[1], tmp);
	1702
	1703	if (!skip) {
	1704	/* Arg 1 below is for "mixed" */
	1705	point_add(nq[0], nq[1], nq[2],
	1706	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
	1707	} else {
	1708	smallfelem_expand(nq[0], tmp[0]);
	1709	smallfelem_expand(nq[1], tmp[1]);
	1710	smallfelem_expand(nq[2], tmp[2]);
	1711	skip = 0;
	1712	}
	1713
	1714	/* second, look at the current position */
	1715	bits = get_bit(g_scalar, i + 192) << 3;
	1716	bits \|= get_bit(g_scalar, i + 128) << 2;
	1717	bits \|= get_bit(g_scalar, i + 64) << 1;
	1718	bits \|= get_bit(g_scalar, i);
	1719	/* select the point to add, in constant time */
	1720	select_point(bits, 16, g_pre_comp[0], tmp);
	1721	/* Arg 1 below is for "mixed" */
	1722	point_add(nq[0], nq[1], nq[2],
	1723	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
	1724	}
	1725
	1726	/* do other additions every 5 doublings */
	1727	if (num_points && (i % 5 == 0)) {
	1728	/* loop over all scalars */
	1729	for (num = 0; num < num_points; ++num) {
	1730	bits = get_bit(scalars[num], i + 4) << 5;
	1731	bits \|= get_bit(scalars[num], i + 3) << 4;
	1732	bits \|= get_bit(scalars[num], i + 2) << 3;
	1733	bits \|= get_bit(scalars[num], i + 1) << 2;
	1734	bits \|= get_bit(scalars[num], i) << 1;
	1735	bits \|= get_bit(scalars[num], i - 1);
	1736	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
	1737
	1738	/*
	1739	* select the point to add or subtract, in constant time
	1740	*/
	1741	select_point(digit, 17, pre_comp[num], tmp);
	1742	smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
	1743	* point */
	1744	copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
	1745	felem_contract(tmp[1], ftmp);
	1746
	1747	if (!skip) {
	1748	point_add(nq[0], nq[1], nq[2],
	1749	nq[0], nq[1], nq[2],
	1750	mixed, tmp[0], tmp[1], tmp[2]);
	1751	} else {
	1752	smallfelem_expand(nq[0], tmp[0]);
	1753	smallfelem_expand(nq[1], tmp[1]);
	1754	smallfelem_expand(nq[2], tmp[2]);
	1755	skip = 0;
	1756	}
	1757	}
	1758	}
	1759	}
	1760	felem_assign(x_out, nq[0]);
	1761	felem_assign(y_out, nq[1]);
	1762	felem_assign(z_out, nq[2]);
	1763	}
	1764
	1765	/* Precomputation for the group generator. */
	1766	struct nistp256_pre_comp_st {
	1767	smallfelem g_pre_comp[2][16][3];
	1768	int references;
	1769	CRYPTO_RWLOCK *lock;
	1770	};
	1771
	1772	const EC_METHOD *EC_GFp_nistp256_method(void)
	1773	{
	1774	static const EC_METHOD ret = {
	1775	EC_FLAGS_DEFAULT_OCT,
	1776	NID_X9_62_prime_field,
	1777	ec_GFp_nistp256_group_init,
	1778	ec_GFp_simple_group_finish,
	1779	ec_GFp_simple_group_clear_finish,
	1780	ec_GFp_nist_group_copy,
	1781	ec_GFp_nistp256_group_set_curve,
	1782	ec_GFp_simple_group_get_curve,
	1783	ec_GFp_simple_group_get_degree,
	1784	ec_group_simple_order_bits,
	1785	ec_GFp_simple_group_check_discriminant,
	1786	ec_GFp_simple_point_init,
	1787	ec_GFp_simple_point_finish,
	1788	ec_GFp_simple_point_clear_finish,
	1789	ec_GFp_simple_point_copy,
	1790	ec_GFp_simple_point_set_to_infinity,
	1791	ec_GFp_simple_set_Jprojective_coordinates_GFp,
	1792	ec_GFp_simple_get_Jprojective_coordinates_GFp,
	1793	ec_GFp_simple_point_set_affine_coordinates,
	1794	ec_GFp_nistp256_point_get_affine_coordinates,
	1795	0 /* point_set_compressed_coordinates */ ,
	1796	0 /* point2oct */ ,
	1797	0 /* oct2point */ ,
	1798	ec_GFp_simple_add,
	1799	ec_GFp_simple_dbl,
	1800	ec_GFp_simple_invert,
	1801	ec_GFp_simple_is_at_infinity,
	1802	ec_GFp_simple_is_on_curve,
	1803	ec_GFp_simple_cmp,
	1804	ec_GFp_simple_make_affine,
	1805	ec_GFp_simple_points_make_affine,
	1806	ec_GFp_nistp256_points_mul,
	1807	ec_GFp_nistp256_precompute_mult,
	1808	ec_GFp_nistp256_have_precompute_mult,
	1809	ec_GFp_nist_field_mul,
	1810	ec_GFp_nist_field_sqr,
	1811	0 /* field_div */ ,
	1812	0 /* field_encode */ ,
	1813	0 /* field_decode */ ,
	1814	0, /* field_set_to_one */
	1815	ec_key_simple_priv2oct,
	1816	ec_key_simple_oct2priv,
	1817	0, /* set private */
	1818	ec_key_simple_generate_key,
	1819	ec_key_simple_check_key,
	1820	ec_key_simple_generate_public_key,
	1821	0, /* keycopy */
	1822	0, /* keyfinish */
	1823	ecdh_simple_compute_key
	1824	};
	1825
	1826	return &ret;
	1827	}
	1828
	1829	/******************************************************************************/
	1830	/*
	1831	* FUNCTIONS TO MANAGE PRECOMPUTATION
	1832	*/
	1833
	1834	static NISTP256_PRE_COMP *nistp256_pre_comp_new()
	1835	{
	1836	NISTP256_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
	1837
	1838	if (ret == NULL) {
	1839	ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
	1840	return ret;
	1841	}
	1842
	1843	ret->references = 1;
	1844
	1845	ret->lock = CRYPTO_THREAD_lock_new();
	1846	if (ret->lock == NULL) {
	1847	ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
	1848	OPENSSL_free(ret);
	1849	return NULL;
	1850	}
	1851	return ret;
	1852	}
	1853
	1854	NISTP256_PRE_COMP EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP p)
	1855	{
	1856	int i;
	1857	if (p != NULL)
	1858	CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
	1859	return p;
	1860	}
	1861
	1862	void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
	1863	{
	1864	int i;
	1865
	1866	if (pre == NULL)
	1867	return;
	1868
	1869	CRYPTO_atomic_add(&pre->references, -1, &i, pre->lock);
	1870	REF_PRINT_COUNT("EC_nistp256", x);
	1871	if (i > 0)
	1872	return;
	1873	REF_ASSERT_ISNT(i < 0);
	1874
	1875	CRYPTO_THREAD_lock_free(pre->lock);
	1876	OPENSSL_free(pre);
	1877	}
	1878
	1879	/******************************************************************************/
	1880	/*
	1881	* OPENSSL EC_METHOD FUNCTIONS
	1882	*/
	1883
	1884	int ec_GFp_nistp256_group_init(EC_GROUP *group)
	1885	{
	1886	int ret;
	1887	ret = ec_GFp_simple_group_init(group);
	1888	group->a_is_minus3 = 1;
	1889	return ret;
	1890	}
	1891
	1892	int ec_GFp_nistp256_group_set_curve(EC_GROUP group, const BIGNUM p,
	1893	const BIGNUM a, const BIGNUM b,
	1894	BN_CTX *ctx)
	1895	{
	1896	int ret = 0;
	1897	BN_CTX *new_ctx = NULL;
	1898	BIGNUM curve_p, curve_a, *curve_b;
	1899
	1900	if (ctx == NULL)
	1901	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	1902	return 0;
	1903	BN_CTX_start(ctx);
	1904	if (((curve_p = BN_CTX_get(ctx)) == NULL) \|\|
	1905	((curve_a = BN_CTX_get(ctx)) == NULL) \|\|
	1906	((curve_b = BN_CTX_get(ctx)) == NULL))
	1907	goto err;
	1908	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
	1909	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
	1910	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
	1911	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
	1912	ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
	1913	EC_R_WRONG_CURVE_PARAMETERS);
	1914	goto err;
	1915	}
	1916	group->field_mod_func = BN_nist_mod_256;
	1917	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
	1918	err:
	1919	BN_CTX_end(ctx);
	1920	BN_CTX_free(new_ctx);
	1921	return ret;
	1922	}
	1923
	1924	/*
	1925	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
	1926	* (X/Z^2, Y/Z^3)
	1927	*/
	1928	int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
	1929	const EC_POINT *point,
	1930	BIGNUM x, BIGNUM y,
	1931	BN_CTX *ctx)
	1932	{
	1933	felem z1, z2, x_in, y_in;
	1934	smallfelem x_out, y_out;
	1935	longfelem tmp;
	1936
	1937	if (EC_POINT_is_at_infinity(group, point)) {
	1938	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
	1939	EC_R_POINT_AT_INFINITY);
	1940	return 0;
	1941	}
	1942	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
	1943	(!BN_to_felem(z1, point->Z)))
	1944	return 0;
	1945	felem_inv(z2, z1);
	1946	felem_square(tmp, z2);
	1947	felem_reduce(z1, tmp);
	1948	felem_mul(tmp, x_in, z1);
	1949	felem_reduce(x_in, tmp);
	1950	felem_contract(x_out, x_in);
	1951	if (x != NULL) {
	1952	if (!smallfelem_to_BN(x, x_out)) {
	1953	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
	1954	ERR_R_BN_LIB);
	1955	return 0;
	1956	}
	1957	}
	1958	felem_mul(tmp, z1, z2);
	1959	felem_reduce(z1, tmp);
	1960	felem_mul(tmp, y_in, z1);
	1961	felem_reduce(y_in, tmp);
	1962	felem_contract(y_out, y_in);
	1963	if (y != NULL) {
	1964	if (!smallfelem_to_BN(y, y_out)) {
	1965	ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
	1966	ERR_R_BN_LIB);
	1967	return 0;
	1968	}
	1969	}
	1970	return 1;
	1971	}
	1972
	1973	/* points below is of size \|num\|, and tmp_smallfelems is of size \|num+1\| */
	1974	static void make_points_affine(size_t num, smallfelem points[][3],
	1975	smallfelem tmp_smallfelems[])
	1976	{
	1977	/*
	1978	* Runs in constant time, unless an input is the point at infinity (which
	1979	* normally shouldn't happen).
	1980	*/
	1981	ec_GFp_nistp_points_make_affine_internal(num,
	1982	points,
	1983	sizeof(smallfelem),
	1984	tmp_smallfelems,
	1985	(void ()(void ))smallfelem_one,
	1986	(int ()(const void ))
	1987	smallfelem_is_zero_int,
	1988	(void ()(void , const void *))
	1989	smallfelem_assign,
	1990	(void ()(void , const void *))
	1991	smallfelem_square_contract,
	1992	(void (*)
	1993	(void , const void ,
	1994	const void *))
	1995	smallfelem_mul_contract,
	1996	(void ()(void , const void *))
	1997	smallfelem_inv_contract,
	1998	/* nothing to contract */
	1999	(void ()(void , const void *))
	2000	smallfelem_assign);
	2001	}
	2002
	2003	/*
	2004	* Computes scalargenerator + \sum scalars[i]points[i], ignoring NULL
	2005	* values Result is stored in r (r can equal one of the inputs).
	2006	*/
	2007	int ec_GFp_nistp256_points_mul(const EC_GROUP group, EC_POINT r,
	2008	const BIGNUM *scalar, size_t num,
	2009	const EC_POINT *points[],
	2010	const BIGNUM scalars[], BN_CTX ctx)
	2011	{
	2012	int ret = 0;
	2013	int j;
	2014	int mixed = 0;
	2015	BN_CTX *new_ctx = NULL;
	2016	BIGNUM x, y, z, tmp_scalar;
	2017	felem_bytearray g_secret;
	2018	felem_bytearray *secrets = NULL;
	2019	smallfelem (*pre_comp)[17][3] = NULL;
	2020	smallfelem *tmp_smallfelems = NULL;
	2021	felem_bytearray tmp;
	2022	unsigned i, num_bytes;
	2023	int have_pre_comp = 0;
	2024	size_t num_points = num;
	2025	smallfelem x_in, y_in, z_in;
	2026	felem x_out, y_out, z_out;
	2027	NISTP256_PRE_COMP *pre = NULL;
	2028	const smallfelem(*g_pre_comp)[16][3] = NULL;
	2029	EC_POINT *generator = NULL;
	2030	const EC_POINT *p = NULL;
	2031	const BIGNUM *p_scalar = NULL;
	2032
	2033	if (ctx == NULL)
	2034	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	2035	return 0;
	2036	BN_CTX_start(ctx);
	2037	if (((x = BN_CTX_get(ctx)) == NULL) \|\|
	2038	((y = BN_CTX_get(ctx)) == NULL) \|\|
	2039	((z = BN_CTX_get(ctx)) == NULL) \|\|
	2040	((tmp_scalar = BN_CTX_get(ctx)) == NULL))
	2041	goto err;
	2042
	2043	if (scalar != NULL) {
	2044	pre = group->pre_comp.nistp256;
	2045	if (pre)
	2046	/* we have precomputation, try to use it */
	2047	g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
	2048	else
	2049	/* try to use the standard precomputation */
	2050	g_pre_comp = &gmul[0];
	2051	generator = EC_POINT_new(group);
	2052	if (generator == NULL)
	2053	goto err;
	2054	/* get the generator from precomputation */
	2055	if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) \|\|
	2056	!smallfelem_to_BN(y, g_pre_comp[0][1][1]) \|\|
	2057	!smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
	2058	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
	2059	goto err;
	2060	}
	2061	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
	2062	generator, x, y, z,
	2063	ctx))
	2064	goto err;
	2065	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
	2066	/* precomputation matches generator */
	2067	have_pre_comp = 1;
	2068	else
	2069	/*
	2070	* we don't have valid precomputation: treat the generator as a
	2071	* random point
	2072	*/
	2073	num_points++;
	2074	}
	2075	if (num_points > 0) {
	2076	if (num_points >= 3) {
	2077	/*
	2078	* unless we precompute multiples for just one or two points,
	2079	* converting those into affine form is time well spent
	2080	*/
	2081	mixed = 1;
	2082	}
	2083	secrets = OPENSSL_malloc(sizeof(secrets) num_points);
	2084	pre_comp = OPENSSL_malloc(sizeof(pre_comp) num_points);
	2085	if (mixed)
	2086	tmp_smallfelems =
	2087	OPENSSL_malloc(sizeof(tmp_smallfelems) (num_points * 17 + 1));
	2088	if ((secrets == NULL) \|\| (pre_comp == NULL)
	2089	\|\| (mixed && (tmp_smallfelems == NULL))) {
	2090	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
	2091	goto err;
	2092	}
	2093
	2094	/*
	2095	* we treat NULL scalars as 0, and NULL points as points at infinity,
	2096	* i.e., they contribute nothing to the linear combination
	2097	*/
	2098	memset(secrets, 0, sizeof(secrets) num_points);
	2099	memset(pre_comp, 0, sizeof(pre_comp) num_points);
	2100	for (i = 0; i < num_points; ++i) {
	2101	if (i == num)
	2102	/*
	2103	* we didn't have a valid precomputation, so we pick the
	2104	* generator
	2105	*/
	2106	{
	2107	p = EC_GROUP_get0_generator(group);
	2108	p_scalar = scalar;
	2109	} else
	2110	/* the i^th point */
	2111	{
	2112	p = points[i];
	2113	p_scalar = scalars[i];
	2114	}
	2115	if ((p_scalar != NULL) && (p != NULL)) {
	2116	/* reduce scalar to 0 <= scalar < 2^256 */
	2117	if ((BN_num_bits(p_scalar) > 256)
	2118	\|\| (BN_is_negative(p_scalar))) {
	2119	/*
	2120	* this is an unusual input, and we don't guarantee
	2121	* constant-timeness
	2122	*/
	2123	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
	2124	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
	2125	goto err;
	2126	}
	2127	num_bytes = BN_bn2bin(tmp_scalar, tmp);
	2128	} else
	2129	num_bytes = BN_bn2bin(p_scalar, tmp);
	2130	flip_endian(secrets[i], tmp, num_bytes);
	2131	/* precompute multiples */
	2132	if ((!BN_to_felem(x_out, p->X)) \|\|
	2133	(!BN_to_felem(y_out, p->Y)) \|\|
	2134	(!BN_to_felem(z_out, p->Z)))
	2135	goto err;
	2136	felem_shrink(pre_comp[i][1][0], x_out);
	2137	felem_shrink(pre_comp[i][1][1], y_out);
	2138	felem_shrink(pre_comp[i][1][2], z_out);
	2139	for (j = 2; j <= 16; ++j) {
	2140	if (j & 1) {
	2141	point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
	2142	pre_comp[i][j][2], pre_comp[i][1][0],
	2143	pre_comp[i][1][1], pre_comp[i][1][2],
	2144	pre_comp[i][j - 1][0],
	2145	pre_comp[i][j - 1][1],
	2146	pre_comp[i][j - 1][2]);
	2147	} else {
	2148	point_double_small(pre_comp[i][j][0],
	2149	pre_comp[i][j][1],
	2150	pre_comp[i][j][2],
	2151	pre_comp[i][j / 2][0],
	2152	pre_comp[i][j / 2][1],
	2153	pre_comp[i][j / 2][2]);
	2154	}
	2155	}
	2156	}
	2157	}
	2158	if (mixed)
	2159	make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
	2160	}
	2161
	2162	/* the scalar for the generator */
	2163	if ((scalar != NULL) && (have_pre_comp)) {
	2164	memset(g_secret, 0, sizeof(g_secret));
	2165	/* reduce scalar to 0 <= scalar < 2^256 */
	2166	if ((BN_num_bits(scalar) > 256) \|\| (BN_is_negative(scalar))) {
	2167	/*
	2168	* this is an unusual input, and we don't guarantee
	2169	* constant-timeness
	2170	*/
	2171	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
	2172	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
	2173	goto err;
	2174	}
	2175	num_bytes = BN_bn2bin(tmp_scalar, tmp);
	2176	} else
	2177	num_bytes = BN_bn2bin(scalar, tmp);
	2178	flip_endian(g_secret, tmp, num_bytes);
	2179	/* do the multiplication with generator precomputation */
	2180	batch_mul(x_out, y_out, z_out,
	2181	(const felem_bytearray(*))secrets, num_points,
	2182	g_secret,
	2183	mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
	2184	} else
	2185	/* do the multiplication without generator precomputation */
	2186	batch_mul(x_out, y_out, z_out,
	2187	(const felem_bytearray(*))secrets, num_points,
	2188	NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
	2189	/* reduce the output to its unique minimal representation */
	2190	felem_contract(x_in, x_out);
	2191	felem_contract(y_in, y_out);
	2192	felem_contract(z_in, z_out);
	2193	if ((!smallfelem_to_BN(x, x_in)) \|\| (!smallfelem_to_BN(y, y_in)) \|\|
	2194	(!smallfelem_to_BN(z, z_in))) {
	2195	ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
	2196	goto err;
	2197	}
	2198	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
	2199
	2200	err:
	2201	BN_CTX_end(ctx);
	2202	EC_POINT_free(generator);
	2203	BN_CTX_free(new_ctx);
	2204	OPENSSL_free(secrets);
	2205	OPENSSL_free(pre_comp);
	2206	OPENSSL_free(tmp_smallfelems);
	2207	return ret;
	2208	}
	2209
	2210	int ec_GFp_nistp256_precompute_mult(EC_GROUP group, BN_CTX ctx)
	2211	{
	2212	int ret = 0;
	2213	NISTP256_PRE_COMP *pre = NULL;
	2214	int i, j;
	2215	BN_CTX *new_ctx = NULL;
	2216	BIGNUM x, y;
	2217	EC_POINT *generator = NULL;
	2218	smallfelem tmp_smallfelems[32];
	2219	felem x_tmp, y_tmp, z_tmp;
	2220
	2221	/* throw away old precomputation */
	2222	EC_pre_comp_free(group);
	2223	if (ctx == NULL)
	2224	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	2225	return 0;
	2226	BN_CTX_start(ctx);
	2227	if (((x = BN_CTX_get(ctx)) == NULL) \|\| ((y = BN_CTX_get(ctx)) == NULL))
	2228	goto err;
	2229	/* get the generator */
	2230	if (group->generator == NULL)
	2231	goto err;
	2232	generator = EC_POINT_new(group);
	2233	if (generator == NULL)
	2234	goto err;
	2235	BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
	2236	BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
	2237	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
	2238	goto err;
	2239	if ((pre = nistp256_pre_comp_new()) == NULL)
	2240	goto err;
	2241	/*
	2242	* if the generator is the standard one, use built-in precomputation
	2243	*/
	2244	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
	2245	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
	2246	goto done;
	2247	}
	2248	if ((!BN_to_felem(x_tmp, group->generator->X)) \|\|
	2249	(!BN_to_felem(y_tmp, group->generator->Y)) \|\|
	2250	(!BN_to_felem(z_tmp, group->generator->Z)))
	2251	goto err;
	2252	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
	2253	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
	2254	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
	2255	/*
	2256	* compute 2^64G, 2^128G, 2^192G for the first table, 2^32G, 2^96*G,
	2257	* 2^160G, 2^224G for the second one
	2258	*/
	2259	for (i = 1; i <= 8; i <<= 1) {
	2260	point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
	2261	pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
	2262	pre->g_pre_comp[0][i][1],
	2263	pre->g_pre_comp[0][i][2]);
	2264	for (j = 0; j < 31; ++j) {
	2265	point_double_small(pre->g_pre_comp[1][i][0],
	2266	pre->g_pre_comp[1][i][1],
	2267	pre->g_pre_comp[1][i][2],
	2268	pre->g_pre_comp[1][i][0],
	2269	pre->g_pre_comp[1][i][1],
	2270	pre->g_pre_comp[1][i][2]);
	2271	}
	2272	if (i == 8)
	2273	break;
	2274	point_double_small(pre->g_pre_comp[0][2 * i][0],
	2275	pre->g_pre_comp[0][2 * i][1],
	2276	pre->g_pre_comp[0][2 * i][2],
	2277	pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
	2278	pre->g_pre_comp[1][i][2]);
	2279	for (j = 0; j < 31; ++j) {
	2280	point_double_small(pre->g_pre_comp[0][2 * i][0],
	2281	pre->g_pre_comp[0][2 * i][1],
	2282	pre->g_pre_comp[0][2 * i][2],
	2283	pre->g_pre_comp[0][2 * i][0],
	2284	pre->g_pre_comp[0][2 * i][1],
	2285	pre->g_pre_comp[0][2 * i][2]);
	2286	}
	2287	}
	2288	for (i = 0; i < 2; i++) {
	2289	/* g_pre_comp[i][0] is the point at infinity */
	2290	memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
	2291	/* the remaining multiples */
	2292	/* 2^64G + 2^128G resp. 2^96G + 2^160G */
	2293	point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
	2294	pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
	2295	pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
	2296	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
	2297	pre->g_pre_comp[i][2][2]);
	2298	/* 2^64G + 2^192G resp. 2^96G + 2^224G */
	2299	point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
	2300	pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
	2301	pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
	2302	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
	2303	pre->g_pre_comp[i][2][2]);
	2304	/* 2^128G + 2^192G resp. 2^160G + 2^224G */
	2305	point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
	2306	pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
	2307	pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
	2308	pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
	2309	pre->g_pre_comp[i][4][2]);
	2310	/*
	2311	* 2^64G + 2^128G + 2^192G resp. 2^96G + 2^160G + 2^224G
	2312	*/
	2313	point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
	2314	pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
	2315	pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
	2316	pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
	2317	pre->g_pre_comp[i][2][2]);
	2318	for (j = 1; j < 8; ++j) {
	2319	/* odd multiples: add G resp. 2^32G /
	2320	point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
	2321	pre->g_pre_comp[i][2 * j + 1][1],
	2322	pre->g_pre_comp[i][2 * j + 1][2],
	2323	pre->g_pre_comp[i][2 * j][0],
	2324	pre->g_pre_comp[i][2 * j][1],
	2325	pre->g_pre_comp[i][2 * j][2],
	2326	pre->g_pre_comp[i][1][0],
	2327	pre->g_pre_comp[i][1][1],
	2328	pre->g_pre_comp[i][1][2]);
	2329	}
	2330	}
	2331	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
	2332
	2333	done:
	2334	SETPRECOMP(group, nistp256, pre);
	2335	pre = NULL;
	2336	ret = 1;
	2337
	2338	err:
	2339	BN_CTX_end(ctx);
	2340	EC_POINT_free(generator);
	2341	BN_CTX_free(new_ctx);
	2342	EC_nistp256_pre_comp_free(pre);
	2343	return ret;
	2344	}
	2345
	2346	int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
	2347	{
	2348	return HAVEPRECOMP(group, nistp256);
	2349	}
	2350	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: EcnlProtoTool/trunk/openssl-1.1.0e/crypto/ec/ecp_nistp256.c@ 380

Download in other formats: