Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

ecp_nistp521.c@ 331

Last change on this file since 331 was 331, checked in by coas-nagasima, 6 years ago
prototoolに関連するプロジェクトをnewlibからmuslを使うよう変更・更新 ntshellをnewlibの下位の実装から、muslのsyscallの実装に変更・更新以下のOSSをアップデート・mruby-1.3.0 ・musl-1.1.18 ・onigmo-6.1.3 ・tcc-0.9.27 以下のOSSを追加・openssl-1.1.0e ・curl-7.57.0 ・zlib-1.2.11 以下のmrbgemsを追加・iij/mruby-digest ・iij/mruby-env ・iij/mruby-errno ・iij/mruby-iijson ・iij/mruby-ipaddr ・iij/mruby-mock ・iij/mruby-require ・iij/mruby-tls-openssl
Property svn:eol-style set to `native` Property svn:mime-type set to `text/x-csrc`
File size: 70.2 KB

Rev	Line
[331]	1	/*
	2	* Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
	3	*
	4	* Licensed under the OpenSSL license (the "License"). You may not use
	5	* this file except in compliance with the License. You can obtain a copy
	6	* in the file LICENSE in the source distribution or at
	7	* https://www.openssl.org/source/license.html
	8	*/
	9
	10	/* Copyright 2011 Google Inc.
	11	*
	12	* Licensed under the Apache License, Version 2.0 (the "License");
	13	*
	14	* you may not use this file except in compliance with the License.
	15	* You may obtain a copy of the License at
	16	*
	17	* http://www.apache.org/licenses/LICENSE-2.0
	18	*
	19	* Unless required by applicable law or agreed to in writing, software
	20	* distributed under the License is distributed on an "AS IS" BASIS,
	21	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	22	* See the License for the specific language governing permissions and
	23	* limitations under the License.
	24	*/
	25
	26	/*
	27	* A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
	28	*
	29	* OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
	30	* Otherwise based on Emilia's P224 work, which was inspired by my curve25519
	31	* work which got its smarts from Daniel J. Bernstein's work on the same.
	32	*/
	33
	34	#include <openssl/opensslconf.h>
	35	#ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
	36	NON_EMPTY_TRANSLATION_UNIT
	37	#else
	38
	39	# ifndef OPENSSL_SYS_VMS
	40	# include <stdint.h>
	41	# else
	42	# include <inttypes.h>
	43	# endif
	44
	45	# include <string.h>
	46	# include <openssl/err.h>
	47	# include "ec_lcl.h"
	48
	/* 128-bit unsigned type: a GCC extension, hence the compiler-version gate below. */
	49	# if defined(__GNUC__) && (__GNUC__ > 3 \|\| (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
	50	/* even with gcc, the typedef won't work for 32-bit platforms */
	51	typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
	52	* platforms */
	53	# else
	54	# error "Need GCC 3.1 or later to define type uint128_t"
	55	# endif
	56
	/* Short aliases for the fixed-width integer types used throughout this file. */
	57	typedef uint8_t u8;
	58	typedef uint64_t u64;
	59	typedef int64_t s64;
	60
	61	/*
	62	* The underlying field. P521 operates over GF(2^521-1). We can serialise an
	63	* element of this field into 66 bytes where the most significant byte
	64	* contains only a single bit (66 * 8 = 528 bits, of which 521 are used).
	65	* We call this an felem_bytearray.
	66	*/
	66
	67	typedef u8 felem_bytearray[66];
	68
	69	/*
	70	* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
	71	* These values are big-endian. Each entry is one 66-byte felem_bytearray;
	71	* the entries are, in index order: p, a, b, x, y (x and y are presumably
	71	* the affine coordinates of the standard base point -- confirm against the
	71	* FIPS text).
	72	*/
	73	static const felem_bytearray nistp521_curve_params[5] = {
	74	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
	75	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	76	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	77	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	78	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	79	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	80	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	81	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	82	0xff, 0xff},
	83	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
	84	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	85	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	86	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	87	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	88	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	89	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	90	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	91	0xff, 0xfc},
	92	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
	93	0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
	94	0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
	95	0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
	96	0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
	97	0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
	98	0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
	99	0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
	100	0x3f, 0x00},
	101	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
	102	0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
	103	0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
	104	0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
	105	0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
	106	0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
	107	0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
	108	0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
	109	0xbd, 0x66},
	110	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
	111	0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
	112	0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
	113	0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
	114	0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
	115	0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
	116	0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
	117	0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
	118	0x66, 0x50}
	119	};
	120
	121	/*-
	122	* The representation of field elements.
	123	* ------------------------------------
	124	*
	125	* We represent field elements with nine values. These values are either 64 or
	126	* 128 bits and the field element represented is:
	127	* v[0]2^0 + v[1]2^58 + v[2]2^116 + ... + v[8]2^464 (mod p)
	128	* Each of the nine values is called a 'limb'. Since the limbs are spaced only
	129	* 58 bits apart, but are greater than 58 bits in length, the most significant
	130	* bits of each limb overlap with the least significant bits of the next.
	131	*
	132	* A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
	133	* 'largefelem' */
	134
	/* Number of limbs in a field element: 8 limbs of 58 bits plus one of 57 (521 bits). */
	135	# define NLIMBS 9
	136
	137	typedef uint64_t limb;
	138	typedef limb felem[NLIMBS];
	139	typedef uint128_t largefelem[NLIMBS];
	140
	/* Masks selecting the low 57 bits (2^57 - 1) and the low 58 bits (2^58 - 1) of a limb. */
	141	static const limb bottom57bits = 0x1ffffffffffffff;
	142	static const limb bottom58bits = 0x3ffffffffffffff;
	143
	144	/*
	145	* bin66_to_felem takes a little-endian byte array and converts it into felem
	146	* form. This assumes that the CPU is little-endian.
	147	*/
	148	static void bin66_to_felem(felem out, const u8 in[66])
	149	{
	150	out[0] = (((limb ) & in[0])) & bottom58bits;
	151	out[1] = (((limb ) & in[7]) >> 2) & bottom58bits;
	152	out[2] = (((limb ) & in[14]) >> 4) & bottom58bits;
	153	out[3] = (((limb ) & in[21]) >> 6) & bottom58bits;
	154	out[4] = (((limb ) & in[29])) & bottom58bits;
	155	out[5] = (((limb ) & in[36]) >> 2) & bottom58bits;
	156	out[6] = (((limb ) & in[43]) >> 4) & bottom58bits;
	157	out[7] = (((limb ) & in[50]) >> 6) & bottom58bits;
	158	out[8] = (((limb ) & in[58])) & bottom57bits;
	159	}
	160
	161	/*
	162	* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
	163	* array. This assumes that the CPU is little-endian.
	164	*/
	165	static void felem_to_bin66(u8 out[66], const felem in)
	166	{
	167	memset(out, 0, 66);
	168	(((limb ) & out[0])) = in[0];
	169	(((limb ) & out[7])) \|= in[1] << 2;
	170	(((limb ) & out[14])) \|= in[2] << 4;
	171	(((limb ) & out[21])) \|= in[3] << 6;
	172	(((limb ) & out[29])) = in[4];
	173	(((limb ) & out[36])) \|= in[5] << 2;
	174	(((limb ) & out[43])) \|= in[6] << 4;
	175	(((limb ) & out[50])) \|= in[7] << 6;
	176	(((limb ) & out[58])) = in[8];
	177	}
	178
	179	/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
	180	static void flip_endian(u8 out, const u8 in, unsigned len)
	181	{
	182	unsigned i;
	183	for (i = 0; i < len; ++i)
	184	out[i] = in[len - 1 - i];
	185	}
	186
	187	/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
	188	static int BN_to_felem(felem out, const BIGNUM *bn)
	189	{
	190	felem_bytearray b_in;
	191	felem_bytearray b_out;
	192	unsigned num_bytes;
	193
	194	/* BN_bn2bin eats leading zeroes */
	195	memset(b_out, 0, sizeof(b_out));
	196	num_bytes = BN_num_bytes(bn);
	197	if (num_bytes > sizeof b_out) {
	198	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
	199	return 0;
	200	}
	201	if (BN_is_negative(bn)) {
	202	ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
	203	return 0;
	204	}
	205	num_bytes = BN_bn2bin(bn, b_in);
	206	flip_endian(b_out, b_in, num_bytes);
	207	bin66_to_felem(out, b_out);
	208	return 1;
	209	}
	210
	211	/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
	212	static BIGNUM felem_to_BN(BIGNUM out, const felem in)
	213	{
	214	felem_bytearray b_in, b_out;
	215	felem_to_bin66(b_in, in);
	216	flip_endian(b_out, b_in, sizeof b_out);
	217	return BN_bin2bn(b_out, sizeof b_out, out);
	218	}
	219
	220	/*-
	221	* Field operations
	222	* ----------------
	223	*/
	224
	225	static void felem_one(felem out)
	226	{
	227	out[0] = 1;
	228	out[1] = 0;
	229	out[2] = 0;
	230	out[3] = 0;
	231	out[4] = 0;
	232	out[5] = 0;
	233	out[6] = 0;
	234	out[7] = 0;
	235	out[8] = 0;
	236	}
	237
	238	static void felem_assign(felem out, const felem in)
	239	{
	240	out[0] = in[0];
	241	out[1] = in[1];
	242	out[2] = in[2];
	243	out[3] = in[3];
	244	out[4] = in[4];
	245	out[5] = in[5];
	246	out[6] = in[6];
	247	out[7] = in[7];
	248	out[8] = in[8];
	249	}
	250
	251	/* felem_sum64 sets out = out + in. */
	252	static void felem_sum64(felem out, const felem in)
	253	{
	254	out[0] += in[0];
	255	out[1] += in[1];
	256	out[2] += in[2];
	257	out[3] += in[3];
	258	out[4] += in[4];
	259	out[5] += in[5];
	260	out[6] += in[6];
	261	out[7] += in[7];
	262	out[8] += in[8];
	263	}
	264
	265	/* felem_scalar sets out = in * scalar */
	266	static void felem_scalar(felem out, const felem in, limb scalar)
	267	{
	268	out[0] = in[0] * scalar;
	269	out[1] = in[1] * scalar;
	270	out[2] = in[2] * scalar;
	271	out[3] = in[3] * scalar;
	272	out[4] = in[4] * scalar;
	273	out[5] = in[5] * scalar;
	274	out[6] = in[6] * scalar;
	275	out[7] = in[7] * scalar;
	276	out[8] = in[8] * scalar;
	277	}
	278
	279	/* felem_scalar64 sets out = out * scalar */
	280	static void felem_scalar64(felem out, limb scalar)
	281	{
	282	out[0] *= scalar;
	283	out[1] *= scalar;
	284	out[2] *= scalar;
	285	out[3] *= scalar;
	286	out[4] *= scalar;
	287	out[5] *= scalar;
	288	out[6] *= scalar;
	289	out[7] *= scalar;
	290	out[8] *= scalar;
	291	}
	292
	293	/* felem_scalar128 sets out = out * scalar */
	294	static void felem_scalar128(largefelem out, limb scalar)
	295	{
	296	out[0] *= scalar;
	297	out[1] *= scalar;
	298	out[2] *= scalar;
	299	out[3] *= scalar;
	300	out[4] *= scalar;
	301	out[5] *= scalar;
	302	out[6] *= scalar;
	303	out[7] *= scalar;
	304	out[8] *= scalar;
	305	}
	306
	307	/*-
	308	* felem_neg sets \|out\| to \|-in\|
	309	* On entry:
	310	* in[i] < 2^59 + 2^14
	311	* On exit:
	312	* out[i] < 2^62
	313	*/
	314	static void felem_neg(felem out, const felem in)
	315	{
	316	/* In order to prevent underflow, we subtract from 0 mod p. */
	317	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
	318	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
	319
	320	out[0] = two62m3 - in[0];
	321	out[1] = two62m2 - in[1];
	322	out[2] = two62m2 - in[2];
	323	out[3] = two62m2 - in[3];
	324	out[4] = two62m2 - in[4];
	325	out[5] = two62m2 - in[5];
	326	out[6] = two62m2 - in[6];
	327	out[7] = two62m2 - in[7];
	328	out[8] = two62m2 - in[8];
	329	}
	330
	331	/*-
	332	* felem_diff64 subtracts \|in\| from \|out\|
	333	* On entry:
	334	* in[i] < 2^59 + 2^14
	335	* On exit:
	336	* out[i] < out[i] + 2^62
	337	*/
	338	static void felem_diff64(felem out, const felem in)
	339	{
	340	/*
	341	* In order to prevent underflow, we add 0 mod p before subtracting.
	342	*/
	343	static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
	344	static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
	345
	346	out[0] += two62m3 - in[0];
	347	out[1] += two62m2 - in[1];
	348	out[2] += two62m2 - in[2];
	349	out[3] += two62m2 - in[3];
	350	out[4] += two62m2 - in[4];
	351	out[5] += two62m2 - in[5];
	352	out[6] += two62m2 - in[6];
	353	out[7] += two62m2 - in[7];
	354	out[8] += two62m2 - in[8];
	355	}
	356
	357	/*-
	358	* felem_diff_128_64 subtracts \|in\| from \|out\|
	359	* On entry:
	360	* in[i] < 2^62 + 2^17
	361	* On exit:
	362	* out[i] < out[i] + 2^63
	363	*/
	364	static void felem_diff_128_64(largefelem out, const felem in)
	365	{
	366	/*
	367	* In order to prevent underflow, we add 0 mod p before subtracting.
	368	*/
	369	static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
	370	static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
	371
	372	out[0] += two63m6 - in[0];
	373	out[1] += two63m5 - in[1];
	374	out[2] += two63m5 - in[2];
	375	out[3] += two63m5 - in[3];
	376	out[4] += two63m5 - in[4];
	377	out[5] += two63m5 - in[5];
	378	out[6] += two63m5 - in[6];
	379	out[7] += two63m5 - in[7];
	380	out[8] += two63m5 - in[8];
	381	}
	382
	383	/*-
	384	* felem_diff_128_64 subtracts \|in\| from \|out\|
	385	* On entry:
	386	* in[i] < 2^126
	387	* On exit:
	388	* out[i] < out[i] + 2^127 - 2^69
	389	*/
	390	static void felem_diff128(largefelem out, const largefelem in)
	391	{
	392	/*
	393	* In order to prevent underflow, we add 0 mod p before subtracting.
	394	*/
	395	static const uint128_t two127m70 =
	396	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
	397	static const uint128_t two127m69 =
	398	(((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
	399
	400	out[0] += (two127m70 - in[0]);
	401	out[1] += (two127m69 - in[1]);
	402	out[2] += (two127m69 - in[2]);
	403	out[3] += (two127m69 - in[3]);
	404	out[4] += (two127m69 - in[4]);
	405	out[5] += (two127m69 - in[5]);
	406	out[6] += (two127m69 - in[6]);
	407	out[7] += (two127m69 - in[7]);
	408	out[8] += (two127m69 - in[8]);
	409	}
	410
	411	/*-
	412	* felem_square sets \|out\| = \|in\|^2
	413	* On entry:
	414	* in[i] < 2^62
	415	* On exit:
	416	* out[i] < 17 * max(in[i]) * max(in[i])
	417	*/
	418	static void felem_square(largefelem out, const felem in)
	419	{
	420	felem inx2, inx4;
	421	felem_scalar(inx2, in, 2);
	422	felem_scalar(inx4, in, 4);
	423
	424	/*-
	425	* We have many cases were we want to do
	426	* in[x] * in[y] +
	427	* in[y] * in[x]
	428	* This is obviously just
	429	* 2 * in[x] * in[y]
	430	* However, rather than do the doubling on the 128 bit result, we
	431	* double one of the inputs to the multiplication by reading from
	432	* \|inx2\|
	433	*/
	434
	435	out[0] = ((uint128_t) in[0]) * in[0];
	436	out[1] = ((uint128_t) in[0]) * inx2[1];
	437	out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
	438	out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
	439	out[4] = ((uint128_t) in[0]) * inx2[4] +
	440	((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
	441	out[5] = ((uint128_t) in[0]) * inx2[5] +
	442	((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
	443	out[6] = ((uint128_t) in[0]) * inx2[6] +
	444	((uint128_t) in[1]) * inx2[5] +
	445	((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
	446	out[7] = ((uint128_t) in[0]) * inx2[7] +
	447	((uint128_t) in[1]) * inx2[6] +
	448	((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
	449	out[8] = ((uint128_t) in[0]) * inx2[8] +
	450	((uint128_t) in[1]) * inx2[7] +
	451	((uint128_t) in[2]) * inx2[6] +
	452	((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
	453
	454	/*
	455	* The remaining limbs fall above 2^521, with the first falling at 2^522.
	456	* They correspond to locations one bit up from the limbs produced above
	457	* so we would have to multiply by two to align them. Again, rather than
	458	* operate on the 128-bit result, we double one of the inputs to the
	459	* multiplication. If we want to double for both this reason, and the
	460	* reason above, then we end up multiplying by four.
	461	*/
	462
	463	/* 9 */
	464	out[0] += ((uint128_t) in[1]) * inx4[8] +
	465	((uint128_t) in[2]) * inx4[7] +
	466	((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
	467
	468	/* 10 */
	469	out[1] += ((uint128_t) in[2]) * inx4[8] +
	470	((uint128_t) in[3]) * inx4[7] +
	471	((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
	472
	473	/* 11 */
	474	out[2] += ((uint128_t) in[3]) * inx4[8] +
	475	((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
	476
	477	/* 12 */
	478	out[3] += ((uint128_t) in[4]) * inx4[8] +
	479	((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
	480
	481	/* 13 */
	482	out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
	483
	484	/* 14 */
	485	out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
	486
	487	/* 15 */
	488	out[6] += ((uint128_t) in[7]) * inx4[8];
	489
	490	/* 16 */
	491	out[7] += ((uint128_t) in[8]) * inx2[8];
	492	}
	493
	494	/*-
	495	* felem_mul sets \|out\| = \|in1\| * \|in2\|
	496	* On entry:
	497	* in1[i] < 2^64
	498	* in2[i] < 2^63
	499	* On exit:
	500	* out[i] < 17 * max(in1[i]) * max(in2[i])
	501	*/
	502	static void felem_mul(largefelem out, const felem in1, const felem in2)
	503	{
	504	felem in2x2;
	505	felem_scalar(in2x2, in2, 2);
	506
	507	out[0] = ((uint128_t) in1[0]) * in2[0];
	508
	509	out[1] = ((uint128_t) in1[0]) * in2[1] +
	510	((uint128_t) in1[1]) * in2[0];
	511
	512	out[2] = ((uint128_t) in1[0]) * in2[2] +
	513	((uint128_t) in1[1]) * in2[1] +
	514	((uint128_t) in1[2]) * in2[0];
	515
	516	out[3] = ((uint128_t) in1[0]) * in2[3] +
	517	((uint128_t) in1[1]) * in2[2] +
	518	((uint128_t) in1[2]) * in2[1] +
	519	((uint128_t) in1[3]) * in2[0];
	520
	521	out[4] = ((uint128_t) in1[0]) * in2[4] +
	522	((uint128_t) in1[1]) * in2[3] +
	523	((uint128_t) in1[2]) * in2[2] +
	524	((uint128_t) in1[3]) * in2[1] +
	525	((uint128_t) in1[4]) * in2[0];
	526
	527	out[5] = ((uint128_t) in1[0]) * in2[5] +
	528	((uint128_t) in1[1]) * in2[4] +
	529	((uint128_t) in1[2]) * in2[3] +
	530	((uint128_t) in1[3]) * in2[2] +
	531	((uint128_t) in1[4]) * in2[1] +
	532	((uint128_t) in1[5]) * in2[0];
	533
	534	out[6] = ((uint128_t) in1[0]) * in2[6] +
	535	((uint128_t) in1[1]) * in2[5] +
	536	((uint128_t) in1[2]) * in2[4] +
	537	((uint128_t) in1[3]) * in2[3] +
	538	((uint128_t) in1[4]) * in2[2] +
	539	((uint128_t) in1[5]) * in2[1] +
	540	((uint128_t) in1[6]) * in2[0];
	541
	542	out[7] = ((uint128_t) in1[0]) * in2[7] +
	543	((uint128_t) in1[1]) * in2[6] +
	544	((uint128_t) in1[2]) * in2[5] +
	545	((uint128_t) in1[3]) * in2[4] +
	546	((uint128_t) in1[4]) * in2[3] +
	547	((uint128_t) in1[5]) * in2[2] +
	548	((uint128_t) in1[6]) * in2[1] +
	549	((uint128_t) in1[7]) * in2[0];
	550
	551	out[8] = ((uint128_t) in1[0]) * in2[8] +
	552	((uint128_t) in1[1]) * in2[7] +
	553	((uint128_t) in1[2]) * in2[6] +
	554	((uint128_t) in1[3]) * in2[5] +
	555	((uint128_t) in1[4]) * in2[4] +
	556	((uint128_t) in1[5]) * in2[3] +
	557	((uint128_t) in1[6]) * in2[2] +
	558	((uint128_t) in1[7]) * in2[1] +
	559	((uint128_t) in1[8]) * in2[0];
	560
	561	/* See comment in felem_square about the use of in2x2 here */
	562
	563	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
	564	((uint128_t) in1[2]) * in2x2[7] +
	565	((uint128_t) in1[3]) * in2x2[6] +
	566	((uint128_t) in1[4]) * in2x2[5] +
	567	((uint128_t) in1[5]) * in2x2[4] +
	568	((uint128_t) in1[6]) * in2x2[3] +
	569	((uint128_t) in1[7]) * in2x2[2] +
	570	((uint128_t) in1[8]) * in2x2[1];
	571
	572	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
	573	((uint128_t) in1[3]) * in2x2[7] +
	574	((uint128_t) in1[4]) * in2x2[6] +
	575	((uint128_t) in1[5]) * in2x2[5] +
	576	((uint128_t) in1[6]) * in2x2[4] +
	577	((uint128_t) in1[7]) * in2x2[3] +
	578	((uint128_t) in1[8]) * in2x2[2];
	579
	580	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
	581	((uint128_t) in1[4]) * in2x2[7] +
	582	((uint128_t) in1[5]) * in2x2[6] +
	583	((uint128_t) in1[6]) * in2x2[5] +
	584	((uint128_t) in1[7]) * in2x2[4] +
	585	((uint128_t) in1[8]) * in2x2[3];
	586
	587	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
	588	((uint128_t) in1[5]) * in2x2[7] +
	589	((uint128_t) in1[6]) * in2x2[6] +
	590	((uint128_t) in1[7]) * in2x2[5] +
	591	((uint128_t) in1[8]) * in2x2[4];
	592
	593	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
	594	((uint128_t) in1[6]) * in2x2[7] +
	595	((uint128_t) in1[7]) * in2x2[6] +
	596	((uint128_t) in1[8]) * in2x2[5];
	597
	598	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
	599	((uint128_t) in1[7]) * in2x2[7] +
	600	((uint128_t) in1[8]) * in2x2[6];
	601
	602	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
	603	((uint128_t) in1[8]) * in2x2[7];
	604
	605	out[7] += ((uint128_t) in1[8]) * in2x2[8];
	606	}
	607
	/* Mask selecting the low 52 bits (2^52 - 1); used by felem_reduce on the high 64-bit half of each 128-bit limb. */
	608	static const limb bottom52bits = 0xfffffffffffff;
	609
	610	/*-
	611	* felem_reduce converts a largefelem to an felem.
	612	* On entry:
	613	* in[i] < 2^128
	614	* On exit:
	615	* out[i] < 2^59 + 2^14
	616	*/
	617	static void felem_reduce(felem out, const largefelem in)
	618	{
	619	u64 overflow1, overflow2;
	620
	621	out[0] = ((limb) in[0]) & bottom58bits;
	622	out[1] = ((limb) in[1]) & bottom58bits;
	623	out[2] = ((limb) in[2]) & bottom58bits;
	624	out[3] = ((limb) in[3]) & bottom58bits;
	625	out[4] = ((limb) in[4]) & bottom58bits;
	626	out[5] = ((limb) in[5]) & bottom58bits;
	627	out[6] = ((limb) in[6]) & bottom58bits;
	628	out[7] = ((limb) in[7]) & bottom58bits;
	629	out[8] = ((limb) in[8]) & bottom58bits;
	630
	631	/* out[i] < 2^58 */
	632
	633	out[1] += ((limb) in[0]) >> 58;
	634	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
	635	/*-
	636	* out[1] < 2^58 + 2^6 + 2^58
	637	* = 2^59 + 2^6
	638	*/
	639	out[2] += ((limb) (in[0] >> 64)) >> 52;
	640
	641	out[2] += ((limb) in[1]) >> 58;
	642	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
	643	out[3] += ((limb) (in[1] >> 64)) >> 52;
	644
	645	out[3] += ((limb) in[2]) >> 58;
	646	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
	647	out[4] += ((limb) (in[2] >> 64)) >> 52;
	648
	649	out[4] += ((limb) in[3]) >> 58;
	650	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
	651	out[5] += ((limb) (in[3] >> 64)) >> 52;
	652
	653	out[5] += ((limb) in[4]) >> 58;
	654	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
	655	out[6] += ((limb) (in[4] >> 64)) >> 52;
	656
	657	out[6] += ((limb) in[5]) >> 58;
	658	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
	659	out[7] += ((limb) (in[5] >> 64)) >> 52;
	660
	661	out[7] += ((limb) in[6]) >> 58;
	662	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
	663	out[8] += ((limb) (in[6] >> 64)) >> 52;
	664
	665	out[8] += ((limb) in[7]) >> 58;
	666	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
	667	/*-
	668	* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
	669	* < 2^59 + 2^13
	670	*/
	671	overflow1 = ((limb) (in[7] >> 64)) >> 52;
	672
	673	overflow1 += ((limb) in[8]) >> 58;
	674	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
	675	overflow2 = ((limb) (in[8] >> 64)) >> 52;
	676
	677	overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
	678	overflow2 <<= 1; /* overflow2 < 2^13 */
	679
	680	out[0] += overflow1; /* out[0] < 2^60 */
	681	out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
	682
	683	out[1] += out[0] >> 58;
	684	out[0] &= bottom58bits;
	685	/*-
	686	* out[0] < 2^58
	687	* out[1] < 2^59 + 2^6 + 2^13 + 2^2
	688	* < 2^59 + 2^14
	689	*/
	690	}
	691
	692	static void felem_square_reduce(felem out, const felem in)
	693	{
	694	largefelem tmp;
	695	felem_square(tmp, in);
	696	felem_reduce(out, tmp);
	697	}
	698
	699	static void felem_mul_reduce(felem out, const felem in1, const felem in2)
	700	{
	701	largefelem tmp;
	702	felem_mul(tmp, in1, in2);
	703	felem_reduce(out, tmp);
	704	}
	705
	706	/*-
	707	* felem_inv calculates \|out\| = \|in\|^{-1}
	708	*
	709	* Based on Fermat's Little Theorem:
	710	* a^p = a (mod p)
	711	* a^{p-1} = 1 (mod p)
	712	* a^{p-2} = a^{-1} (mod p)
	713	*/
	714	static void felem_inv(felem out, const felem in)
	715	{
	716	felem ftmp, ftmp2, ftmp3, ftmp4;
	717	largefelem tmp;
	718	unsigned i;
	719
	720	felem_square(tmp, in);
	721	felem_reduce(ftmp, tmp); /* 2^1 */
	722	felem_mul(tmp, in, ftmp);
	723	felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
	724	felem_assign(ftmp2, ftmp);
	725	felem_square(tmp, ftmp);
	726	felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
	727	felem_mul(tmp, in, ftmp);
	728	felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
	729	felem_square(tmp, ftmp);
	730	felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
	731
	732	felem_square(tmp, ftmp2);
	733	felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
	734	felem_square(tmp, ftmp3);
	735	felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
	736	felem_mul(tmp, ftmp3, ftmp2);
	737	felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
	738
	739	felem_assign(ftmp2, ftmp3);
	740	felem_square(tmp, ftmp3);
	741	felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
	742	felem_square(tmp, ftmp3);
	743	felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
	744	felem_square(tmp, ftmp3);
	745	felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
	746	felem_square(tmp, ftmp3);
	747	felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
	748	felem_assign(ftmp4, ftmp3);
	749	felem_mul(tmp, ftmp3, ftmp);
	750	felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
	751	felem_square(tmp, ftmp4);
	752	felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
	753	felem_mul(tmp, ftmp3, ftmp2);
	754	felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
	755	felem_assign(ftmp2, ftmp3);
	756
	757	for (i = 0; i < 8; i++) {
	758	felem_square(tmp, ftmp3);
	759	felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
	760	}
	761	felem_mul(tmp, ftmp3, ftmp2);
	762	felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
	763	felem_assign(ftmp2, ftmp3);
	764
	765	for (i = 0; i < 16; i++) {
	766	felem_square(tmp, ftmp3);
	767	felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
	768	}
	769	felem_mul(tmp, ftmp3, ftmp2);
	770	felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
	771	felem_assign(ftmp2, ftmp3);
	772
	773	for (i = 0; i < 32; i++) {
	774	felem_square(tmp, ftmp3);
	775	felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
	776	}
	777	felem_mul(tmp, ftmp3, ftmp2);
	778	felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
	779	felem_assign(ftmp2, ftmp3);
	780
	781	for (i = 0; i < 64; i++) {
	782	felem_square(tmp, ftmp3);
	783	felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
	784	}
	785	felem_mul(tmp, ftmp3, ftmp2);
	786	felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
	787	felem_assign(ftmp2, ftmp3);
	788
	789	for (i = 0; i < 128; i++) {
	790	felem_square(tmp, ftmp3);
	791	felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
	792	}
	793	felem_mul(tmp, ftmp3, ftmp2);
	794	felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
	795	felem_assign(ftmp2, ftmp3);
	796
	797	for (i = 0; i < 256; i++) {
	798	felem_square(tmp, ftmp3);
	799	felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
	800	}
	801	felem_mul(tmp, ftmp3, ftmp2);
	802	felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
	803
	804	for (i = 0; i < 9; i++) {
	805	felem_square(tmp, ftmp3);
	806	felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
	807	}
	808	felem_mul(tmp, ftmp3, ftmp4);
	809	felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
	810	felem_mul(tmp, ftmp3, in);
	811	felem_reduce(out, tmp); /* 2^512 - 3 */
	812	}
	813
	814	/*
	814	* This is 2^521-1, expressed as an felem: limbs 0..7 have all 58 low bits
	814	* set and limb 8 has its 57 low bits set (8 * 58 + 57 = 521).
	814	*/
	815	static const felem kPrime = {
	816	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	817	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
	818	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
	819	};
	820
	821	/*-
	822	* felem_is_zero returns a limb with all bits set if \|in\| == 0 (mod p) and 0
	823	* otherwise.
	824	* On entry:
	825	* in[i] < 2^59 + 2^14
	826	*/
	827	static limb felem_is_zero(const felem in)
	828	{
	829	felem ftmp;
	830	limb is_zero, is_p;
	831	felem_assign(ftmp, in);
	832
	833	ftmp[0] += ftmp[8] >> 57;
	834	ftmp[8] &= bottom57bits;
	835	/* ftmp[8] < 2^57 */
	836	ftmp[1] += ftmp[0] >> 58;
	837	ftmp[0] &= bottom58bits;
	838	ftmp[2] += ftmp[1] >> 58;
	839	ftmp[1] &= bottom58bits;
	840	ftmp[3] += ftmp[2] >> 58;
	841	ftmp[2] &= bottom58bits;
	842	ftmp[4] += ftmp[3] >> 58;
	843	ftmp[3] &= bottom58bits;
	844	ftmp[5] += ftmp[4] >> 58;
	845	ftmp[4] &= bottom58bits;
	846	ftmp[6] += ftmp[5] >> 58;
	847	ftmp[5] &= bottom58bits;
	848	ftmp[7] += ftmp[6] >> 58;
	849	ftmp[6] &= bottom58bits;
	850	ftmp[8] += ftmp[7] >> 58;
	851	ftmp[7] &= bottom58bits;
	852	/* ftmp[8] < 2^57 + 4 */
	853
	854	/*
	855	* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
	856	* than our bound for ftmp[8]. Therefore we only have to check if the
	857	* zero is zero or 2^521-1.
	858	*/
	859
	860	is_zero = 0;
	861	is_zero \|= ftmp[0];
	862	is_zero \|= ftmp[1];
	863	is_zero \|= ftmp[2];
	864	is_zero \|= ftmp[3];
	865	is_zero \|= ftmp[4];
	866	is_zero \|= ftmp[5];
	867	is_zero \|= ftmp[6];
	868	is_zero \|= ftmp[7];
	869	is_zero \|= ftmp[8];
	870
	871	is_zero--;
	872	/*
	873	* We know that ftmp[i] < 2^63, therefore the only way that the top bit
	874	* can be set is if is_zero was 0 before the decrement.
	875	*/
	876	is_zero = ((s64) is_zero) >> 63;
	877
	878	is_p = ftmp[0] ^ kPrime[0];
	879	is_p \|= ftmp[1] ^ kPrime[1];
	880	is_p \|= ftmp[2] ^ kPrime[2];
	881	is_p \|= ftmp[3] ^ kPrime[3];
	882	is_p \|= ftmp[4] ^ kPrime[4];
	883	is_p \|= ftmp[5] ^ kPrime[5];
	884	is_p \|= ftmp[6] ^ kPrime[6];
	885	is_p \|= ftmp[7] ^ kPrime[7];
	886	is_p \|= ftmp[8] ^ kPrime[8];
	887
	888	is_p--;
	889	is_p = ((s64) is_p) >> 63;
	890
	891	is_zero \|= is_p;
	892	return is_zero;
	893	}
	894
	895	static int felem_is_zero_int(const felem in)
	896	{
	897	return (int)(felem_is_zero(in) & ((limb) 1));
	898	}
	899
	900	/*-
	901	* felem_contract converts \|in\| to its unique, minimal representation.
	902	* On entry:
	903	* in[i] < 2^59 + 2^14
	904	*/
	905	static void felem_contract(felem out, const felem in)
	906	{
	907	limb is_p, is_greater, sign;
	908	static const limb two58 = ((limb) 1) << 58;
	909
	910	felem_assign(out, in);
	911
	912	out[0] += out[8] >> 57;
	913	out[8] &= bottom57bits;
	914	/* out[8] < 2^57 */
	915	out[1] += out[0] >> 58;
	916	out[0] &= bottom58bits;
	917	out[2] += out[1] >> 58;
	918	out[1] &= bottom58bits;
	919	out[3] += out[2] >> 58;
	920	out[2] &= bottom58bits;
	921	out[4] += out[3] >> 58;
	922	out[3] &= bottom58bits;
	923	out[5] += out[4] >> 58;
	924	out[4] &= bottom58bits;
	925	out[6] += out[5] >> 58;
	926	out[5] &= bottom58bits;
	927	out[7] += out[6] >> 58;
	928	out[6] &= bottom58bits;
	929	out[8] += out[7] >> 58;
	930	out[7] &= bottom58bits;
	931	/* out[8] < 2^57 + 4 */
	932
	933	/*
	934	* If the value is greater than 2^521-1 then we have to subtract 2^521-1
	935	* out. See the comments in felem_is_zero regarding why we don't test for
	936	* other multiples of the prime.
	937	*/
	938
	939	/*
	940	* First, if \|out\| is equal to 2^521-1, we subtract it out to get zero.
	941	*/
	942
	943	is_p = out[0] ^ kPrime[0];
	944	is_p \|= out[1] ^ kPrime[1];
	945	is_p \|= out[2] ^ kPrime[2];
	946	is_p \|= out[3] ^ kPrime[3];
	947	is_p \|= out[4] ^ kPrime[4];
	948	is_p \|= out[5] ^ kPrime[5];
	949	is_p \|= out[6] ^ kPrime[6];
	950	is_p \|= out[7] ^ kPrime[7];
	951	is_p \|= out[8] ^ kPrime[8];
	952
	953	is_p--;
	954	is_p &= is_p << 32;
	955	is_p &= is_p << 16;
	956	is_p &= is_p << 8;
	957	is_p &= is_p << 4;
	958	is_p &= is_p << 2;
	959	is_p &= is_p << 1;
	960	is_p = ((s64) is_p) >> 63;
	961	is_p = ~is_p;
	962
	963	/* is_p is 0 iff \|out\| == 2^521-1 and all ones otherwise */
	964
	965	out[0] &= is_p;
	966	out[1] &= is_p;
	967	out[2] &= is_p;
	968	out[3] &= is_p;
	969	out[4] &= is_p;
	970	out[5] &= is_p;
	971	out[6] &= is_p;
	972	out[7] &= is_p;
	973	out[8] &= is_p;
	974
	975	/*
	976	* In order to test that \|out\| >= 2^521-1 we need only test if out[8] >>
	977	* 57 is greater than zero as (2^521-1) + x >= 2^522
	978	*/
	979	is_greater = out[8] >> 57;
	980	is_greater \|= is_greater << 32;
	981	is_greater \|= is_greater << 16;
	982	is_greater \|= is_greater << 8;
	983	is_greater \|= is_greater << 4;
	984	is_greater \|= is_greater << 2;
	985	is_greater \|= is_greater << 1;
	986	is_greater = ((s64) is_greater) >> 63;
	987
	988	out[0] -= kPrime[0] & is_greater;
	989	out[1] -= kPrime[1] & is_greater;
	990	out[2] -= kPrime[2] & is_greater;
	991	out[3] -= kPrime[3] & is_greater;
	992	out[4] -= kPrime[4] & is_greater;
	993	out[5] -= kPrime[5] & is_greater;
	994	out[6] -= kPrime[6] & is_greater;
	995	out[7] -= kPrime[7] & is_greater;
	996	out[8] -= kPrime[8] & is_greater;
	997
	998	/* Eliminate negative coefficients */
	999	sign = -(out[0] >> 63);
	1000	out[0] += (two58 & sign);
	1001	out[1] -= (1 & sign);
	1002	sign = -(out[1] >> 63);
	1003	out[1] += (two58 & sign);
	1004	out[2] -= (1 & sign);
	1005	sign = -(out[2] >> 63);
	1006	out[2] += (two58 & sign);
	1007	out[3] -= (1 & sign);
	1008	sign = -(out[3] >> 63);
	1009	out[3] += (two58 & sign);
	1010	out[4] -= (1 & sign);
	1011	sign = -(out[4] >> 63);
	1012	out[4] += (two58 & sign);
	1013	out[5] -= (1 & sign);
	1014	sign = -(out[0] >> 63);
	1015	out[5] += (two58 & sign);
	1016	out[6] -= (1 & sign);
	1017	sign = -(out[6] >> 63);
	1018	out[6] += (two58 & sign);
	1019	out[7] -= (1 & sign);
	1020	sign = -(out[7] >> 63);
	1021	out[7] += (two58 & sign);
	1022	out[8] -= (1 & sign);
	1023	sign = -(out[5] >> 63);
	1024	out[5] += (two58 & sign);
	1025	out[6] -= (1 & sign);
	1026	sign = -(out[6] >> 63);
	1027	out[6] += (two58 & sign);
	1028	out[7] -= (1 & sign);
	1029	sign = -(out[7] >> 63);
	1030	out[7] += (two58 & sign);
	1031	out[8] -= (1 & sign);
	1032	}
	1033
	1034	/*-
	1035	* Group operations
	1036	* ----------------
	1037	*
	1038	* Building on top of the field operations we have the operations on the
	1039	* elliptic curve group itself. Points on the curve are represented in Jacobian
	1040	* coordinates */
	1041
	1042	/*-
	1043	* point_double calculates 2*(x_in, y_in, z_in)
	1044	*
	1045	* The method is taken from:
	1046	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
	1047	*
	1048	* Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
	1049	* while x_out == y_in is not (maybe this works, but it's not tested). */
	1050	static void
	1051	point_double(felem x_out, felem y_out, felem z_out,
	1052	const felem x_in, const felem y_in, const felem z_in)
	1053	{
	1054	largefelem tmp, tmp2;
	1055	felem delta, gamma, beta, alpha, ftmp, ftmp2;
	1056
	1057	felem_assign(ftmp, x_in);
	1058	felem_assign(ftmp2, x_in);
	1059
	1060	/* delta = z^2 */
	1061	felem_square(tmp, z_in);
	1062	felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
	1063
	1064	/* gamma = y^2 */
	1065	felem_square(tmp, y_in);
	1066	felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
	1067
	1068	/* beta = xgamma /
	1069	felem_mul(tmp, x_in, gamma);
	1070	felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
	1071
	1072	/* alpha = 3(x-delta)(x+delta) */
	1073	felem_diff64(ftmp, delta);
	1074	/* ftmp[i] < 2^61 */
	1075	felem_sum64(ftmp2, delta);
	1076	/* ftmp2[i] < 2^60 + 2^15 */
	1077	felem_scalar64(ftmp2, 3);
	1078	/* ftmp2[i] < 32^60 + 32^15 */
	1079	felem_mul(tmp, ftmp, ftmp2);
	1080	/*-
	1081	* tmp[i] < 17(32^121 + 32^76)
	1082	* = 612^121 + 612^76
	1083	* < 642^121 + 642^76
	1084	* = 2^127 + 2^82
	1085	* < 2^128
	1086	*/
	1087	felem_reduce(alpha, tmp);
	1088
	1089	/* x' = alpha^2 - 8beta /
	1090	felem_square(tmp, alpha);
	1091	/*
	1092	* tmp[i] < 17*2^120 < 2^125
	1093	*/
	1094	felem_assign(ftmp, beta);
	1095	felem_scalar64(ftmp, 8);
	1096	/* ftmp[i] < 2^62 + 2^17 */
	1097	felem_diff_128_64(tmp, ftmp);
	1098	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
	1099	felem_reduce(x_out, tmp);
	1100
	1101	/* z' = (y + z)^2 - gamma - delta */
	1102	felem_sum64(delta, gamma);
	1103	/* delta[i] < 2^60 + 2^15 */
	1104	felem_assign(ftmp, y_in);
	1105	felem_sum64(ftmp, z_in);
	1106	/* ftmp[i] < 2^60 + 2^15 */
	1107	felem_square(tmp, ftmp);
	1108	/*
	1109	* tmp[i] < 17(2^122) < 2^127
	1110	*/
	1111	felem_diff_128_64(tmp, delta);
	1112	/* tmp[i] < 2^127 + 2^63 */
	1113	felem_reduce(z_out, tmp);
	1114
	1115	/* y' = alpha(4beta - x') - 8gamma^2 /
	1116	felem_scalar64(beta, 4);
	1117	/* beta[i] < 2^61 + 2^16 */
	1118	felem_diff64(beta, x_out);
	1119	/* beta[i] < 2^61 + 2^60 + 2^16 */
	1120	felem_mul(tmp, alpha, beta);
	1121	/*-
	1122	* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
	1123	* = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
	1124	* = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	1125	* < 2^128
	1126	*/
	1127	felem_square(tmp2, gamma);
	1128	/*-
	1129	* tmp2[i] < 17*(2^59 + 2^14)^2
	1130	* = 17*(2^118 + 2^74 + 2^28)
	1131	*/
	1132	felem_scalar128(tmp2, 8);
	1133	/*-
	1134	* tmp2[i] < 817(2^118 + 2^74 + 2^28)
	1135	* = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
	1136	* < 2^126
	1137	*/
	1138	felem_diff128(tmp, tmp2);
	1139	/*-
	1140	* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
	1141	* = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
	1142	* 2^74 + 2^69 + 2^34 + 2^30
	1143	* < 2^128
	1144	*/
	1145	felem_reduce(y_out, tmp);
	1146	}
	1147
	1148	/* copy_conditional copies in to out iff mask is all ones. */
	1149	static void copy_conditional(felem out, const felem in, limb mask)
	1150	{
	1151	unsigned i;
	1152	for (i = 0; i < NLIMBS; ++i) {
	1153	const limb tmp = mask & (in[i] ^ out[i]);
	1154	out[i] ^= tmp;
	1155	}
	1156	}
	1157
	1158	/*-
	1159	* point_add calculates (x1, y1, z1) + (x2, y2, z2)
	1160	*
	1161	* The method is taken from
	1162	* http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
	1163	* adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
	1164	*
	1165	* This function includes a branch for checking whether the two input points
	1166	* are equal (while not equal to the point at infinity). This case never
	1167	* happens during single point multiplication, so there is no timing leak for
	1168	* ECDH or ECDSA signing. */
	1169	static void point_add(felem x3, felem y3, felem z3,
	1170	const felem x1, const felem y1, const felem z1,
	1171	const int mixed, const felem x2, const felem y2,
	1172	const felem z2)
	1173	{
	1174	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	1175	largefelem tmp, tmp2;
	1176	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
	1177
	1178	z1_is_zero = felem_is_zero(z1);
	1179	z2_is_zero = felem_is_zero(z2);
	1180
	1181	/* ftmp = z1z1 = z1*2 /
	1182	felem_square(tmp, z1);
	1183	felem_reduce(ftmp, tmp);
	1184
	1185	if (!mixed) {
	1186	/* ftmp2 = z2z2 = z2*2 /
	1187	felem_square(tmp, z2);
	1188	felem_reduce(ftmp2, tmp);
	1189
	1190	/* u1 = ftmp3 = x1z2z2 /
	1191	felem_mul(tmp, x1, ftmp2);
	1192	felem_reduce(ftmp3, tmp);
	1193
	1194	/* ftmp5 = z1 + z2 */
	1195	felem_assign(ftmp5, z1);
	1196	felem_sum64(ftmp5, z2);
	1197	/* ftmp5[i] < 2^61 */
	1198
	1199	/* ftmp5 = (z1 + z2)*2 - z1z1 - z2z2 = 2z1z2 */
	1200	felem_square(tmp, ftmp5);
	1201	/* tmp[i] < 172^122 /
	1202	felem_diff_128_64(tmp, ftmp);
	1203	/* tmp[i] < 172^122 + 2^63 /
	1204	felem_diff_128_64(tmp, ftmp2);
	1205	/* tmp[i] < 172^122 + 2^64 /
	1206	felem_reduce(ftmp5, tmp);
	1207
	1208	/* ftmp2 = z2 * z2z2 */
	1209	felem_mul(tmp, ftmp2, z2);
	1210	felem_reduce(ftmp2, tmp);
	1211
	1212	/* s1 = ftmp6 = y1 * z2*3 /
	1213	felem_mul(tmp, y1, ftmp2);
	1214	felem_reduce(ftmp6, tmp);
	1215	} else {
	1216	/*
	1217	* We'll assume z2 = 1 (special case z2 = 0 is handled later)
	1218	*/
	1219
	1220	/* u1 = ftmp3 = x1z2z2 /
	1221	felem_assign(ftmp3, x1);
	1222
	1223	/* ftmp5 = 2z1z2 /
	1224	felem_scalar(ftmp5, z1, 2);
	1225
	1226	/* s1 = ftmp6 = y1 * z2*3 /
	1227	felem_assign(ftmp6, y1);
	1228	}
	1229
	1230	/* u2 = x2z1z1 /
	1231	felem_mul(tmp, x2, ftmp);
	1232	/* tmp[i] < 172^120 /
	1233
	1234	/* h = ftmp4 = u2 - u1 */
	1235	felem_diff_128_64(tmp, ftmp3);
	1236	/* tmp[i] < 172^120 + 2^63 /
	1237	felem_reduce(ftmp4, tmp);
	1238
	1239	x_equal = felem_is_zero(ftmp4);
	1240
	1241	/* z_out = ftmp5 * h */
	1242	felem_mul(tmp, ftmp5, ftmp4);
	1243	felem_reduce(z_out, tmp);
	1244
	1245	/* ftmp = z1 * z1z1 */
	1246	felem_mul(tmp, ftmp, z1);
	1247	felem_reduce(ftmp, tmp);
	1248
	1249	/* s2 = tmp = y2 * z1*3 /
	1250	felem_mul(tmp, y2, ftmp);
	1251	/* tmp[i] < 172^120 /
	1252
	1253	/* r = ftmp5 = (s2 - s1)2 /
	1254	felem_diff_128_64(tmp, ftmp6);
	1255	/* tmp[i] < 172^120 + 2^63 /
	1256	felem_reduce(ftmp5, tmp);
	1257	y_equal = felem_is_zero(ftmp5);
	1258	felem_scalar64(ftmp5, 2);
	1259	/* ftmp5[i] < 2^61 */
	1260
	1261	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
	1262	point_double(x3, y3, z3, x1, y1, z1);
	1263	return;
	1264	}
	1265
	1266	/* I = ftmp = (2h)*2 /
	1267	felem_assign(ftmp, ftmp4);
	1268	felem_scalar64(ftmp, 2);
	1269	/* ftmp[i] < 2^61 */
	1270	felem_square(tmp, ftmp);
	1271	/* tmp[i] < 172^122 /
	1272	felem_reduce(ftmp, tmp);
	1273
	1274	/* J = ftmp2 = h * I */
	1275	felem_mul(tmp, ftmp4, ftmp);
	1276	felem_reduce(ftmp2, tmp);
	1277
	1278	/* V = ftmp4 = U1 * I */
	1279	felem_mul(tmp, ftmp3, ftmp);
	1280	felem_reduce(ftmp4, tmp);
	1281
	1282	/* x_out = r*2 - J - 2V /
	1283	felem_square(tmp, ftmp5);
	1284	/* tmp[i] < 172^122 /
	1285	felem_diff_128_64(tmp, ftmp2);
	1286	/* tmp[i] < 172^122 + 2^63 /
	1287	felem_assign(ftmp3, ftmp4);
	1288	felem_scalar64(ftmp4, 2);
	1289	/* ftmp4[i] < 2^61 */
	1290	felem_diff_128_64(tmp, ftmp4);
	1291	/* tmp[i] < 172^122 + 2^64 /
	1292	felem_reduce(x_out, tmp);
	1293
	1294	/* y_out = r(V-x_out) - 2 * s1 * J */
	1295	felem_diff64(ftmp3, x_out);
	1296	/*
	1297	* ftmp3[i] < 2^60 + 2^60 = 2^61
	1298	*/
	1299	felem_mul(tmp, ftmp5, ftmp3);
	1300	/* tmp[i] < 172^122 /
	1301	felem_mul(tmp2, ftmp6, ftmp2);
	1302	/* tmp2[i] < 172^120 /
	1303	felem_scalar128(tmp2, 2);
	1304	/* tmp2[i] < 172^121 /
	1305	felem_diff128(tmp, tmp2);
	1306	/*-
	1307	* tmp[i] < 2^127 - 2^69 + 17*2^122
	1308	* = 2^126 - 2^122 - 2^6 - 2^2 - 1
	1309	* < 2^127
	1310	*/
	1311	felem_reduce(y_out, tmp);
	1312
	1313	copy_conditional(x_out, x2, z1_is_zero);
	1314	copy_conditional(x_out, x1, z2_is_zero);
	1315	copy_conditional(y_out, y2, z1_is_zero);
	1316	copy_conditional(y_out, y1, z2_is_zero);
	1317	copy_conditional(z_out, z2, z1_is_zero);
	1318	copy_conditional(z_out, z1, z2_is_zero);
	1319	felem_assign(x3, x_out);
	1320	felem_assign(y3, y_out);
	1321	felem_assign(z3, z_out);
	1322	}
	1323
/*-
 * Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 16 elements:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | 0G
 *     1 | 0 0 0 1 | 1G
 *     2 | 0 0 1 0 | 2^130G
 *     3 | 0 0 1 1 | (2^130 + 1)G
 *     4 | 0 1 0 0 | 2^260G
 *     5 | 0 1 0 1 | (2^260 + 1)G
 *     6 | 0 1 1 0 | (2^260 + 2^130)G
 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
 *     8 | 1 0 0 0 | 2^390G
 *     9 | 1 0 0 1 | (2^390 + 1)G
 *    10 | 1 0 1 0 | (2^390 + 2^130)G
 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
 *    12 | 1 1 0 0 | (2^390 + 2^260)G
 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point.
 *
 * Tables for other points have table[i] = i*P for i in 0 .. 16, where P is
 * the point in question. */
	1357
/*
 * gmul is the table of precomputed base points.
 *
 * Each of the 16 entries is a point given as three field elements {x, y, z},
 * each written as nine limbs. Entry 0 is all zero (the point at infinity,
 * z = 0); every other entry has z = 1. The comment on each entry gives the
 * multiple of the generator G stored at that index.
 */
static const felem gmul[16][3] = {
    /* 0: 0G */
    {{0, 0, 0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0, 0, 0},
     {0, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 1: 1G */
    {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
      0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
      0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
     {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
      0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
      0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 2: 2^130G */
    {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
      0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
      0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
     {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
      0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
      0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 3: (2^130 + 1)G */
    {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
      0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
      0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
     {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
      0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
      0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 4: 2^260G */
    {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
      0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
      0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
     {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
      0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
      0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 5: (2^260 + 1)G */
    {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
      0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
      0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
     {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
      0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
      0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 6: (2^260 + 2^130)G */
    {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
      0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
      0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
     {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
      0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
      0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 7: (2^260 + 2^130 + 1)G */
    {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
      0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
      0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
     {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
      0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
      0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 8: 2^390G */
    {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
      0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
      0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
     {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
      0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
      0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 9: (2^390 + 1)G */
    {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
      0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
      0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
     {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
      0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
      0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 10: (2^390 + 2^130)G */
    {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
      0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
      0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
     {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
      0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
      0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 11: (2^390 + 2^130 + 1)G */
    {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
      0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
      0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
     {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
      0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
      0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 12: (2^390 + 2^260)G */
    {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
      0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
      0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
     {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
      0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
      0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 13: (2^390 + 2^260 + 1)G */
    {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
      0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
      0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
     {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
      0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
      0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 14: (2^390 + 2^260 + 2^130)G */
    {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
      0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
      0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
     {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
      0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
      0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}},
    /* 15: (2^390 + 2^260 + 2^130 + 1)G */
    {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
      0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
      0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
     {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
      0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
      0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
     {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
	1469
	1470	/*
	1471	* select_point selects the \|idx\|th point from a precomputation table and
	1472	* copies it to out.
	1473	*/
	1474	/* pre_comp below is of the size provided in \|size\| */
	1475	static void select_point(const limb idx, unsigned int size,
	1476	const felem pre_comp[][3], felem out[3])
	1477	{
	1478	unsigned i, j;
	1479	limb *outlimbs = &out[0][0];
	1480
	1481	memset(out, 0, sizeof(out) 3);
	1482
	1483	for (i = 0; i < size; i++) {
	1484	const limb *inlimbs = &pre_comp[i][0][0];
	1485	limb mask = i ^ idx;
	1486	mask \|= mask >> 4;
	1487	mask \|= mask >> 2;
	1488	mask \|= mask >> 1;
	1489	mask &= 1;
	1490	mask--;
	1491	for (j = 0; j < NLIMBS * 3; j++)
	1492	outlimbs[j] \|= inlimbs[j] & mask;
	1493	}
	1494	}
	1495
	1496	/* get_bit returns the \|i\|th bit in \|in\| */
	1497	static char get_bit(const felem_bytearray in, int i)
	1498	{
	1499	if (i < 0)
	1500	return 0;
	1501	return (in[i >> 3] >> (i & 7)) & 1;
	1502	}
	1503
	1504	/*
	1505	* Interleaved point multiplication using precomputed point multiples: The
	1506	* small point multiples 0P, 1P, ..., 16*P are in pre_comp[], the scalars
	1507	* in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
	1508	* generator, using certain (large) precomputed multiples in g_pre_comp.
	1509	* Output point (X, Y, Z) is stored in x_out, y_out, z_out
	1510	*/
	1511	static void batch_mul(felem x_out, felem y_out, felem z_out,
	1512	const felem_bytearray scalars[],
	1513	const unsigned num_points, const u8 *g_scalar,
	1514	const int mixed, const felem pre_comp[][17][3],
	1515	const felem g_pre_comp[16][3])
	1516	{
	1517	int i, skip;
	1518	unsigned num, gen_mul = (g_scalar != NULL);
	1519	felem nq[3], tmp[4];
	1520	limb bits;
	1521	u8 sign, digit;
	1522
	1523	/* set nq to the point at infinity */
	1524	memset(nq, 0, sizeof(nq));
	1525
	1526	/*
	1527	* Loop over all scalars msb-to-lsb, interleaving additions of multiples
	1528	* of the generator (last quarter of rounds) and additions of other
	1529	* points multiples (every 5th round).
	1530	*/
	1531	skip = 1; /* save two point operations in the first
	1532	* round */
	1533	for (i = (num_points ? 520 : 130); i >= 0; --i) {
	1534	/* double */
	1535	if (!skip)
	1536	point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
	1537
	1538	/* add multiples of the generator */
	1539	if (gen_mul && (i <= 130)) {
	1540	bits = get_bit(g_scalar, i + 390) << 3;
	1541	if (i < 130) {
	1542	bits \|= get_bit(g_scalar, i + 260) << 2;
	1543	bits \|= get_bit(g_scalar, i + 130) << 1;
	1544	bits \|= get_bit(g_scalar, i);
	1545	}
	1546	/* select the point to add, in constant time */
	1547	select_point(bits, 16, g_pre_comp, tmp);
	1548	if (!skip) {
	1549	/* The 1 argument below is for "mixed" */
	1550	point_add(nq[0], nq[1], nq[2],
	1551	nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
	1552	} else {
	1553	memcpy(nq, tmp, 3 * sizeof(felem));
	1554	skip = 0;
	1555	}
	1556	}
	1557
	1558	/* do other additions every 5 doublings */
	1559	if (num_points && (i % 5 == 0)) {
	1560	/* loop over all scalars */
	1561	for (num = 0; num < num_points; ++num) {
	1562	bits = get_bit(scalars[num], i + 4) << 5;
	1563	bits \|= get_bit(scalars[num], i + 3) << 4;
	1564	bits \|= get_bit(scalars[num], i + 2) << 3;
	1565	bits \|= get_bit(scalars[num], i + 1) << 2;
	1566	bits \|= get_bit(scalars[num], i) << 1;
	1567	bits \|= get_bit(scalars[num], i - 1);
	1568	ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
	1569
	1570	/*
	1571	* select the point to add or subtract, in constant time
	1572	*/
	1573	select_point(digit, 17, pre_comp[num], tmp);
	1574	felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
	1575	* point */
	1576	copy_conditional(tmp[1], tmp[3], (-(limb) sign));
	1577
	1578	if (!skip) {
	1579	point_add(nq[0], nq[1], nq[2],
	1580	nq[0], nq[1], nq[2],
	1581	mixed, tmp[0], tmp[1], tmp[2]);
	1582	} else {
	1583	memcpy(nq, tmp, 3 * sizeof(felem));
	1584	skip = 0;
	1585	}
	1586	}
	1587	}
	1588	}
	1589	felem_assign(x_out, nq[0]);
	1590	felem_assign(y_out, nq[1]);
	1591	felem_assign(z_out, nq[2]);
	1592	}
	1593
/* Precomputation for the group generator. */
struct nistp521_pre_comp_st {
    felem g_pre_comp[16][3];    /* 16 points, three field elements each */
    int references;             /* reference count; starts at 1 and is
                                 * adjusted via CRYPTO_atomic_add() */
    CRYPTO_RWLOCK *lock;        /* lock handed to CRYPTO_atomic_add() when
                                 * updating |references| */
};
	1600
	1601	const EC_METHOD *EC_GFp_nistp521_method(void)
	1602	{
	1603	static const EC_METHOD ret = {
	1604	EC_FLAGS_DEFAULT_OCT,
	1605	NID_X9_62_prime_field,
	1606	ec_GFp_nistp521_group_init,
	1607	ec_GFp_simple_group_finish,
	1608	ec_GFp_simple_group_clear_finish,
	1609	ec_GFp_nist_group_copy,
	1610	ec_GFp_nistp521_group_set_curve,
	1611	ec_GFp_simple_group_get_curve,
	1612	ec_GFp_simple_group_get_degree,
	1613	ec_group_simple_order_bits,
	1614	ec_GFp_simple_group_check_discriminant,
	1615	ec_GFp_simple_point_init,
	1616	ec_GFp_simple_point_finish,
	1617	ec_GFp_simple_point_clear_finish,
	1618	ec_GFp_simple_point_copy,
	1619	ec_GFp_simple_point_set_to_infinity,
	1620	ec_GFp_simple_set_Jprojective_coordinates_GFp,
	1621	ec_GFp_simple_get_Jprojective_coordinates_GFp,
	1622	ec_GFp_simple_point_set_affine_coordinates,
	1623	ec_GFp_nistp521_point_get_affine_coordinates,
	1624	0 /* point_set_compressed_coordinates */ ,
	1625	0 /* point2oct */ ,
	1626	0 /* oct2point */ ,
	1627	ec_GFp_simple_add,
	1628	ec_GFp_simple_dbl,
	1629	ec_GFp_simple_invert,
	1630	ec_GFp_simple_is_at_infinity,
	1631	ec_GFp_simple_is_on_curve,
	1632	ec_GFp_simple_cmp,
	1633	ec_GFp_simple_make_affine,
	1634	ec_GFp_simple_points_make_affine,
	1635	ec_GFp_nistp521_points_mul,
	1636	ec_GFp_nistp521_precompute_mult,
	1637	ec_GFp_nistp521_have_precompute_mult,
	1638	ec_GFp_nist_field_mul,
	1639	ec_GFp_nist_field_sqr,
	1640	0 /* field_div */ ,
	1641	0 /* field_encode */ ,
	1642	0 /* field_decode */ ,
	1643	0, /* field_set_to_one */
	1644	ec_key_simple_priv2oct,
	1645	ec_key_simple_oct2priv,
	1646	0, /* set private */
	1647	ec_key_simple_generate_key,
	1648	ec_key_simple_check_key,
	1649	ec_key_simple_generate_public_key,
	1650	0, /* keycopy */
	1651	0, /* keyfinish */
	1652	ecdh_simple_compute_key
	1653	};
	1654
	1655	return &ret;
	1656	}
	1657
	1658	/******************************************************************************/
	1659	/*
	1660	* FUNCTIONS TO MANAGE PRECOMPUTATION
	1661	*/
	1662
	1663	static NISTP521_PRE_COMP *nistp521_pre_comp_new()
	1664	{
	1665	NISTP521_PRE_COMP ret = OPENSSL_zalloc(sizeof(ret));
	1666
	1667	if (ret == NULL) {
	1668	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
	1669	return ret;
	1670	}
	1671
	1672	ret->references = 1;
	1673
	1674	ret->lock = CRYPTO_THREAD_lock_new();
	1675	if (ret->lock == NULL) {
	1676	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
	1677	OPENSSL_free(ret);
	1678	return NULL;
	1679	}
	1680	return ret;
	1681	}
	1682
	1683	NISTP521_PRE_COMP EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP p)
	1684	{
	1685	int i;
	1686	if (p != NULL)
	1687	CRYPTO_atomic_add(&p->references, 1, &i, p->lock);
	1688	return p;
	1689	}
	1690
	1691	void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
	1692	{
	1693	int i;
	1694
	1695	if (p == NULL)
	1696	return;
	1697
	1698	CRYPTO_atomic_add(&p->references, -1, &i, p->lock);
	1699	REF_PRINT_COUNT("EC_nistp521", x);
	1700	if (i > 0)
	1701	return;
	1702	REF_ASSERT_ISNT(i < 0);
	1703
	1704	CRYPTO_THREAD_lock_free(p->lock);
	1705	OPENSSL_free(p);
	1706	}
	1707
	1708	/******************************************************************************/
	1709	/*
	1710	* OPENSSL EC_METHOD FUNCTIONS
	1711	*/
	1712
	1713	int ec_GFp_nistp521_group_init(EC_GROUP *group)
	1714	{
	1715	int ret;
	1716	ret = ec_GFp_simple_group_init(group);
	1717	group->a_is_minus3 = 1;
	1718	return ret;
	1719	}
	1720
	1721	int ec_GFp_nistp521_group_set_curve(EC_GROUP group, const BIGNUM p,
	1722	const BIGNUM a, const BIGNUM b,
	1723	BN_CTX *ctx)
	1724	{
	1725	int ret = 0;
	1726	BN_CTX *new_ctx = NULL;
	1727	BIGNUM curve_p, curve_a, *curve_b;
	1728
	1729	if (ctx == NULL)
	1730	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	1731	return 0;
	1732	BN_CTX_start(ctx);
	1733	if (((curve_p = BN_CTX_get(ctx)) == NULL) \|\|
	1734	((curve_a = BN_CTX_get(ctx)) == NULL) \|\|
	1735	((curve_b = BN_CTX_get(ctx)) == NULL))
	1736	goto err;
	1737	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
	1738	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
	1739	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
	1740	if ((BN_cmp(curve_p, p)) \|\| (BN_cmp(curve_a, a)) \|\| (BN_cmp(curve_b, b))) {
	1741	ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
	1742	EC_R_WRONG_CURVE_PARAMETERS);
	1743	goto err;
	1744	}
	1745	group->field_mod_func = BN_nist_mod_521;
	1746	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
	1747	err:
	1748	BN_CTX_end(ctx);
	1749	BN_CTX_free(new_ctx);
	1750	return ret;
	1751	}
	1752
	1753	/*
	1754	* Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
	1755	* (X/Z^2, Y/Z^3)
	1756	*/
	1757	int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
	1758	const EC_POINT *point,
	1759	BIGNUM x, BIGNUM y,
	1760	BN_CTX *ctx)
	1761	{
	1762	felem z1, z2, x_in, y_in, x_out, y_out;
	1763	largefelem tmp;
	1764
	1765	if (EC_POINT_is_at_infinity(group, point)) {
	1766	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
	1767	EC_R_POINT_AT_INFINITY);
	1768	return 0;
	1769	}
	1770	if ((!BN_to_felem(x_in, point->X)) \|\| (!BN_to_felem(y_in, point->Y)) \|\|
	1771	(!BN_to_felem(z1, point->Z)))
	1772	return 0;
	1773	felem_inv(z2, z1);
	1774	felem_square(tmp, z2);
	1775	felem_reduce(z1, tmp);
	1776	felem_mul(tmp, x_in, z1);
	1777	felem_reduce(x_in, tmp);
	1778	felem_contract(x_out, x_in);
	1779	if (x != NULL) {
	1780	if (!felem_to_BN(x, x_out)) {
	1781	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
	1782	ERR_R_BN_LIB);
	1783	return 0;
	1784	}
	1785	}
	1786	felem_mul(tmp, z1, z2);
	1787	felem_reduce(z1, tmp);
	1788	felem_mul(tmp, y_in, z1);
	1789	felem_reduce(y_in, tmp);
	1790	felem_contract(y_out, y_in);
	1791	if (y != NULL) {
	1792	if (!felem_to_BN(y, y_out)) {
	1793	ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
	1794	ERR_R_BN_LIB);
	1795	return 0;
	1796	}
	1797	}
	1798	return 1;
	1799	}
	1800
	1801	/* points below is of size \|num\|, and tmp_felems is of size \|num+1/ */
	1802	static void make_points_affine(size_t num, felem points[][3],
	1803	felem tmp_felems[])
	1804	{
	1805	/*
	1806	* Runs in constant time, unless an input is the point at infinity (which
	1807	* normally shouldn't happen).
	1808	*/
	1809	ec_GFp_nistp_points_make_affine_internal(num,
	1810	points,
	1811	sizeof(felem),
	1812	tmp_felems,
	1813	(void ()(void ))felem_one,
	1814	(int ()(const void ))
	1815	felem_is_zero_int,
	1816	(void ()(void , const void *))
	1817	felem_assign,
	1818	(void ()(void , const void *))
	1819	felem_square_reduce, (void (*)
	1820	(void *,
	1821	const void
	1822	*,
	1823	const void
	1824	*))
	1825	felem_mul_reduce,
	1826	(void ()(void , const void *))
	1827	felem_inv,
	1828	(void ()(void , const void *))
	1829	felem_contract);
	1830	}
	1831
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values. Result is stored in r (r can equal one of the inputs).
 */
	1836	int ec_GFp_nistp521_points_mul(const EC_GROUP group, EC_POINT r,
	1837	const BIGNUM *scalar, size_t num,
	1838	const EC_POINT *points[],
	1839	const BIGNUM scalars[], BN_CTX ctx)
	1840	{
	1841	int ret = 0;
	1842	int j;
	1843	int mixed = 0;
	1844	BN_CTX *new_ctx = NULL;
	1845	BIGNUM x, y, z, tmp_scalar;
	1846	felem_bytearray g_secret;
	1847	felem_bytearray *secrets = NULL;
	1848	felem (*pre_comp)[17][3] = NULL;
	1849	felem *tmp_felems = NULL;
	1850	felem_bytearray tmp;
	1851	unsigned i, num_bytes;
	1852	int have_pre_comp = 0;
	1853	size_t num_points = num;
	1854	felem x_in, y_in, z_in, x_out, y_out, z_out;
	1855	NISTP521_PRE_COMP *pre = NULL;
	1856	felem(*g_pre_comp)[3] = NULL;
	1857	EC_POINT *generator = NULL;
	1858	const EC_POINT *p = NULL;
	1859	const BIGNUM *p_scalar = NULL;
	1860
	1861	if (ctx == NULL)
	1862	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	1863	return 0;
	1864	BN_CTX_start(ctx);
	1865	if (((x = BN_CTX_get(ctx)) == NULL) \|\|
	1866	((y = BN_CTX_get(ctx)) == NULL) \|\|
	1867	((z = BN_CTX_get(ctx)) == NULL) \|\|
	1868	((tmp_scalar = BN_CTX_get(ctx)) == NULL))
	1869	goto err;
	1870
	1871	if (scalar != NULL) {
	1872	pre = group->pre_comp.nistp521;
	1873	if (pre)
	1874	/* we have precomputation, try to use it */
	1875	g_pre_comp = &pre->g_pre_comp[0];
	1876	else
	1877	/* try to use the standard precomputation */
	1878	g_pre_comp = (felem(*)[3]) gmul;
	1879	generator = EC_POINT_new(group);
	1880	if (generator == NULL)
	1881	goto err;
	1882	/* get the generator from precomputation */
	1883	if (!felem_to_BN(x, g_pre_comp[1][0]) \|\|
	1884	!felem_to_BN(y, g_pre_comp[1][1]) \|\|
	1885	!felem_to_BN(z, g_pre_comp[1][2])) {
	1886	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	1887	goto err;
	1888	}
	1889	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
	1890	generator, x, y, z,
	1891	ctx))
	1892	goto err;
	1893	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
	1894	/* precomputation matches generator */
	1895	have_pre_comp = 1;
	1896	else
	1897	/*
	1898	* we don't have valid precomputation: treat the generator as a
	1899	* random point
	1900	*/
	1901	num_points++;
	1902	}
	1903
	1904	if (num_points > 0) {
	1905	if (num_points >= 2) {
	1906	/*
	1907	* unless we precompute multiples for just one point, converting
	1908	* those into affine form is time well spent
	1909	*/
	1910	mixed = 1;
	1911	}
	1912	secrets = OPENSSL_zalloc(sizeof(secrets) num_points);
	1913	pre_comp = OPENSSL_zalloc(sizeof(pre_comp) num_points);
	1914	if (mixed)
	1915	tmp_felems =
	1916	OPENSSL_malloc(sizeof(tmp_felems) (num_points * 17 + 1));
	1917	if ((secrets == NULL) \|\| (pre_comp == NULL)
	1918	\|\| (mixed && (tmp_felems == NULL))) {
	1919	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
	1920	goto err;
	1921	}
	1922
	1923	/*
	1924	* we treat NULL scalars as 0, and NULL points as points at infinity,
	1925	* i.e., they contribute nothing to the linear combination
	1926	*/
	1927	for (i = 0; i < num_points; ++i) {
	1928	if (i == num)
	1929	/*
	1930	* we didn't have a valid precomputation, so we pick the
	1931	* generator
	1932	*/
	1933	{
	1934	p = EC_GROUP_get0_generator(group);
	1935	p_scalar = scalar;
	1936	} else
	1937	/* the i^th point */
	1938	{
	1939	p = points[i];
	1940	p_scalar = scalars[i];
	1941	}
	1942	if ((p_scalar != NULL) && (p != NULL)) {
	1943	/* reduce scalar to 0 <= scalar < 2^521 */
	1944	if ((BN_num_bits(p_scalar) > 521)
	1945	\|\| (BN_is_negative(p_scalar))) {
	1946	/*
	1947	* this is an unusual input, and we don't guarantee
	1948	* constant-timeness
	1949	*/
	1950	if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
	1951	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	1952	goto err;
	1953	}
	1954	num_bytes = BN_bn2bin(tmp_scalar, tmp);
	1955	} else
	1956	num_bytes = BN_bn2bin(p_scalar, tmp);
	1957	flip_endian(secrets[i], tmp, num_bytes);
	1958	/* precompute multiples */
	1959	if ((!BN_to_felem(x_out, p->X)) \|\|
	1960	(!BN_to_felem(y_out, p->Y)) \|\|
	1961	(!BN_to_felem(z_out, p->Z)))
	1962	goto err;
	1963	memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
	1964	memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
	1965	memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
	1966	for (j = 2; j <= 16; ++j) {
	1967	if (j & 1) {
	1968	point_add(pre_comp[i][j][0], pre_comp[i][j][1],
	1969	pre_comp[i][j][2], pre_comp[i][1][0],
	1970	pre_comp[i][1][1], pre_comp[i][1][2], 0,
	1971	pre_comp[i][j - 1][0],
	1972	pre_comp[i][j - 1][1],
	1973	pre_comp[i][j - 1][2]);
	1974	} else {
	1975	point_double(pre_comp[i][j][0], pre_comp[i][j][1],
	1976	pre_comp[i][j][2], pre_comp[i][j / 2][0],
	1977	pre_comp[i][j / 2][1],
	1978	pre_comp[i][j / 2][2]);
	1979	}
	1980	}
	1981	}
	1982	}
	1983	if (mixed)
	1984	make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
	1985	}
	1986
	1987	/* the scalar for the generator */
	1988	if ((scalar != NULL) && (have_pre_comp)) {
	1989	memset(g_secret, 0, sizeof(g_secret));
	1990	/* reduce scalar to 0 <= scalar < 2^521 */
	1991	if ((BN_num_bits(scalar) > 521) \|\| (BN_is_negative(scalar))) {
	1992	/*
	1993	* this is an unusual input, and we don't guarantee
	1994	* constant-timeness
	1995	*/
	1996	if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
	1997	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	1998	goto err;
	1999	}
	2000	num_bytes = BN_bn2bin(tmp_scalar, tmp);
	2001	} else
	2002	num_bytes = BN_bn2bin(scalar, tmp);
	2003	flip_endian(g_secret, tmp, num_bytes);
	2004	/* do the multiplication with generator precomputation */
	2005	batch_mul(x_out, y_out, z_out,
	2006	(const felem_bytearray(*))secrets, num_points,
	2007	g_secret,
	2008	mixed, (const felem(*)[17][3])pre_comp,
	2009	(const felem(*)[3])g_pre_comp);
	2010	} else
	2011	/* do the multiplication without generator precomputation */
	2012	batch_mul(x_out, y_out, z_out,
	2013	(const felem_bytearray(*))secrets, num_points,
	2014	NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
	2015	/* reduce the output to its unique minimal representation */
	2016	felem_contract(x_in, x_out);
	2017	felem_contract(y_in, y_out);
	2018	felem_contract(z_in, z_out);
	2019	if ((!felem_to_BN(x, x_in)) \|\| (!felem_to_BN(y, y_in)) \|\|
	2020	(!felem_to_BN(z, z_in))) {
	2021	ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	2022	goto err;
	2023	}
	2024	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
	2025
	2026	err:
	2027	BN_CTX_end(ctx);
	2028	EC_POINT_free(generator);
	2029	BN_CTX_free(new_ctx);
	2030	OPENSSL_free(secrets);
	2031	OPENSSL_free(pre_comp);
	2032	OPENSSL_free(tmp_felems);
	2033	return ret;
	2034	}
	2035
	2036	int ec_GFp_nistp521_precompute_mult(EC_GROUP group, BN_CTX ctx)
	2037	{
	2038	int ret = 0;
	2039	NISTP521_PRE_COMP *pre = NULL;
	2040	int i, j;
	2041	BN_CTX *new_ctx = NULL;
	2042	BIGNUM x, y;
	2043	EC_POINT *generator = NULL;
	2044	felem tmp_felems[16];
	2045
	2046	/* throw away old precomputation */
	2047	EC_pre_comp_free(group);
	2048	if (ctx == NULL)
	2049	if ((ctx = new_ctx = BN_CTX_new()) == NULL)
	2050	return 0;
	2051	BN_CTX_start(ctx);
	2052	if (((x = BN_CTX_get(ctx)) == NULL) \|\| ((y = BN_CTX_get(ctx)) == NULL))
	2053	goto err;
	2054	/* get the generator */
	2055	if (group->generator == NULL)
	2056	goto err;
	2057	generator = EC_POINT_new(group);
	2058	if (generator == NULL)
	2059	goto err;
	2060	BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
	2061	BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
	2062	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
	2063	goto err;
	2064	if ((pre = nistp521_pre_comp_new()) == NULL)
	2065	goto err;
	2066	/*
	2067	* if the generator is the standard one, use built-in precomputation
	2068	*/
	2069	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
	2070	memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
	2071	goto done;
	2072	}
	2073	if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) \|\|
	2074	(!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) \|\|
	2075	(!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
	2076	goto err;
	2077	/* compute 2^130G, 2^260G, 2^390G /
	2078	for (i = 1; i <= 4; i <<= 1) {
	2079	point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
	2080	pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
	2081	pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
	2082	for (j = 0; j < 129; ++j) {
	2083	point_double(pre->g_pre_comp[2 * i][0],
	2084	pre->g_pre_comp[2 * i][1],
	2085	pre->g_pre_comp[2 * i][2],
	2086	pre->g_pre_comp[2 * i][0],
	2087	pre->g_pre_comp[2 * i][1],
	2088	pre->g_pre_comp[2 * i][2]);
	2089	}
	2090	}
	2091	/* g_pre_comp[0] is the point at infinity */
	2092	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
	2093	/* the remaining multiples */
	2094	/* 2^130G + 2^260G */
	2095	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
	2096	pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
	2097	pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
	2098	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
	2099	pre->g_pre_comp[2][2]);
	2100	/* 2^130G + 2^390G */
	2101	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
	2102	pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
	2103	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
	2104	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
	2105	pre->g_pre_comp[2][2]);
	2106	/* 2^260G + 2^390G */
	2107	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
	2108	pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
	2109	pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
	2110	0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
	2111	pre->g_pre_comp[4][2]);
	2112	/* 2^130G + 2^260G + 2^390G /
	2113	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
	2114	pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
	2115	pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
	2116	0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
	2117	pre->g_pre_comp[2][2]);
	2118	for (i = 1; i < 8; ++i) {
	2119	/* odd multiples: add G */
	2120	point_add(pre->g_pre_comp[2 * i + 1][0],
	2121	pre->g_pre_comp[2 * i + 1][1],
	2122	pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
	2123	pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
	2124	pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
	2125	pre->g_pre_comp[1][2]);
	2126	}
	2127	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
	2128
	2129	done:
	2130	SETPRECOMP(group, nistp521, pre);
	2131	ret = 1;
	2132	pre = NULL;
	2133	err:
	2134	BN_CTX_end(ctx);
	2135	EC_POINT_free(generator);
	2136	BN_CTX_free(new_ctx);
	2137	EC_nistp521_pre_comp_free(pre);
	2138	return ret;
	2139	}
	2140
	2141	int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
	2142	{
	2143	return HAVEPRECOMP(group, nistp521);
	2144	}
	2145
	2146	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: EcnlProtoTool/trunk/openssl-1.1.0e/crypto/ec/ecp_nistp521.c@ 331

Download in other formats: