/ src / ec / ec_p256_m15.c
ec_p256_m15.c
   1  /*
   2   * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3   *
   4   * Permission is hereby granted, free of charge, to any person obtaining 
   5   * a copy of this software and associated documentation files (the
   6   * "Software"), to deal in the Software without restriction, including
   7   * without limitation the rights to use, copy, modify, merge, publish,
   8   * distribute, sublicense, and/or sell copies of the Software, and to
   9   * permit persons to whom the Software is furnished to do so, subject to
  10   * the following conditions:
  11   *
  12   * The above copyright notice and this permission notice shall be 
  13   * included in all copies or substantial portions of the Software.
  14   *
  15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
  16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
  18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22   * SOFTWARE.
  23   */
  24  
  25  #include "inner.h"
  26  
  27  /*
  28   * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29   * that right-shifting a signed negative integer copies the sign bit
  30   * (arithmetic right-shift). This is "implementation-defined behaviour",
  31   * i.e. it is not undefined, but it may differ between compilers. Each
  32   * compiler is supposed to document its behaviour in that respect. GCC
  33   * explicitly defines that an arithmetic right shift is used. We expect
  34   * all other compilers to do the same, because underlying CPU offer an
  35   * arithmetic right shift opcode that could not be used otherwise.
  36   */
  37  #if BR_NO_ARITH_SHIFT
  38  #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
  39                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40  #else
  41  #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
  42  #endif
  43  
  44  /*
  45   * Convert an integer from unsigned big-endian encoding to a sequence of
  46   * 13-bit words in little-endian order. The final "partial" word is
  47   * returned.
  48   */
  49  static uint32_t
  50  be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
  51  {
  52  	uint32_t acc;
  53  	int acc_len;
  54  
  55  	acc = 0;
  56  	acc_len = 0;
  57  	while (len -- > 0) {
  58  		acc |= (uint32_t)src[len] << acc_len;
  59  		acc_len += 8;
  60  		if (acc_len >= 13) {
  61  			*dst ++ = acc & 0x1FFF;
  62  			acc >>= 13;
  63  			acc_len -= 13;
  64  		}
  65  	}
  66  	return acc;
  67  }
  68  
  69  /*
  70   * Convert an integer (13-bit words, little-endian) to unsigned
  71   * big-endian encoding. The total encoding length is provided; all
  72   * the destination bytes will be filled.
  73   */
  74  static void
  75  le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  76  {
  77  	uint32_t acc;
  78  	int acc_len;
  79  
  80  	acc = 0;
  81  	acc_len = 0;
  82  	while (len -- > 0) {
  83  		if (acc_len < 8) {
  84  			acc |= (*src ++) << acc_len;
  85  			acc_len += 13;
  86  		}
  87  		dst[len] = (unsigned char)acc;
  88  		acc >>= 8;
  89  		acc_len -= 8;
  90  	}
  91  }
  92  
  93  /*
  94   * Normalise an array of words to a strict 13 bits per word. Returned
  95   * value is the resulting carry. The source (w) and destination (d)
  96   * arrays may be identical, but shall not overlap partially.
  97   */
  98  static inline uint32_t
  99  norm13(uint32_t *d, const uint32_t *w, size_t len)
 100  {
 101  	size_t u;
 102  	uint32_t cc;
 103  
 104  	cc = 0;
 105  	for (u = 0; u < len; u ++) {
 106  		int32_t z;
 107  
 108  		z = w[u] + cc;
 109  		d[u] = z & 0x1FFF;
 110  		cc = ARSH(z, 13);
 111  	}
 112  	return cc;
 113  }
 114  
 115  /*
 116   * mul20() multiplies two 260-bit integers together. Each word must fit
 117   * on 13 bits; source operands use 20 words, destination operand
 118   * receives 40 words. All overlaps allowed.
 119   *
 120   * square20() computes the square of a 260-bit integer. Each word must
 121   * fit on 13 bits; source operand uses 20 words, destination operand
 122   * receives 40 words. All overlaps allowed.
 123   */
 124  
 125  #if BR_SLOW_MUL15
 126  
 127  static void
 128  mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 129  {
 130  	/*
 131  	 * Two-level Karatsuba: turns a 20x20 multiplication into
 132  	 * nine 5x5 multiplications. We use 13-bit words but do not
 133  	 * propagate carries immediately, so words may expand:
 134  	 *
 135  	 *  - First Karatsuba decomposition turns the 20x20 mul on
 136  	 *    13-bit words into three 10x10 muls, two on 13-bit words
 137  	 *    and one on 14-bit words.
 138  	 *
 139  	 *  - Second Karatsuba decomposition further splits these into:
 140  	 *
 141  	 *     * four 5x5 muls on 13-bit words
 142  	 *     * four 5x5 muls on 14-bit words
 143  	 *     * one 5x5 mul on 15-bit words
 144  	 *
 145  	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
 146  	 * or 15-bit words, respectively.
 147  	 */
 148  	uint32_t u[45], v[45], w[90];
 149  	uint32_t cc;
 150  	int i;
 151  
 152  #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 153  		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
 154  			+ (s2w)[5 * (s2_off) + 0]; \
 155  		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
 156  			+ (s2w)[5 * (s2_off) + 1]; \
 157  		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
 158  			+ (s2w)[5 * (s2_off) + 2]; \
 159  		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
 160  			+ (s2w)[5 * (s2_off) + 3]; \
 161  		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
 162  			+ (s2w)[5 * (s2_off) + 4]; \
 163  	} while (0)
 164  
 165  #define ZADDT(dw, d_off, sw, s_off)   do { \
 166  		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
 167  		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
 168  		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
 169  		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
 170  		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
 171  	} while (0)
 172  
 173  #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
 174  		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
 175  			+ (s2w)[5 * (s2_off) + 0]; \
 176  		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
 177  			+ (s2w)[5 * (s2_off) + 1]; \
 178  		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
 179  			+ (s2w)[5 * (s2_off) + 2]; \
 180  		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
 181  			+ (s2w)[5 * (s2_off) + 3]; \
 182  		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
 183  			+ (s2w)[5 * (s2_off) + 4]; \
 184  	} while (0)
 185  
 186  #define CPR1(w, cprcc)   do { \
 187  		uint32_t cprz = (w) + cprcc; \
 188  		(w) = cprz & 0x1FFF; \
 189  		cprcc = cprz >> 13; \
 190  	} while (0)
 191  
 192  #define CPR(dw, d_off)   do { \
 193  		uint32_t cprcc; \
 194  		cprcc = 0; \
 195  		CPR1((dw)[(d_off) + 0], cprcc); \
 196  		CPR1((dw)[(d_off) + 1], cprcc); \
 197  		CPR1((dw)[(d_off) + 2], cprcc); \
 198  		CPR1((dw)[(d_off) + 3], cprcc); \
 199  		CPR1((dw)[(d_off) + 4], cprcc); \
 200  		CPR1((dw)[(d_off) + 5], cprcc); \
 201  		CPR1((dw)[(d_off) + 6], cprcc); \
 202  		CPR1((dw)[(d_off) + 7], cprcc); \
 203  		CPR1((dw)[(d_off) + 8], cprcc); \
 204  		(dw)[(d_off) + 9] = cprcc; \
 205  	} while (0)
 206  
 207  	memcpy(u, a, 20 * sizeof *a);
 208  	ZADD(u, 4, a, 0, a, 1);
 209  	ZADD(u, 5, a, 2, a, 3);
 210  	ZADD(u, 6, a, 0, a, 2);
 211  	ZADD(u, 7, a, 1, a, 3);
 212  	ZADD(u, 8, u, 6, u, 7);
 213  
 214  	memcpy(v, b, 20 * sizeof *b);
 215  	ZADD(v, 4, b, 0, b, 1);
 216  	ZADD(v, 5, b, 2, b, 3);
 217  	ZADD(v, 6, b, 0, b, 2);
 218  	ZADD(v, 7, b, 1, b, 3);
 219  	ZADD(v, 8, v, 6, v, 7);
 220  
 221  	/*
 222  	 * Do the eight first 8x8 muls. Source words are at most 16382
 223  	 * each, so we can add product results together "as is" in 32-bit
 224  	 * words.
 225  	 */
 226  	for (i = 0; i < 40; i += 5) {
 227  		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
 228  		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
 229  			+ MUL15(u[i + 1], v[i + 0]);
 230  		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
 231  			+ MUL15(u[i + 1], v[i + 1])
 232  			+ MUL15(u[i + 2], v[i + 0]);
 233  		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
 234  			+ MUL15(u[i + 1], v[i + 2])
 235  			+ MUL15(u[i + 2], v[i + 1])
 236  			+ MUL15(u[i + 3], v[i + 0]);
 237  		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
 238  			+ MUL15(u[i + 1], v[i + 3])
 239  			+ MUL15(u[i + 2], v[i + 2])
 240  			+ MUL15(u[i + 3], v[i + 1])
 241  			+ MUL15(u[i + 4], v[i + 0]);
 242  		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
 243  			+ MUL15(u[i + 2], v[i + 3])
 244  			+ MUL15(u[i + 3], v[i + 2])
 245  			+ MUL15(u[i + 4], v[i + 1]);
 246  		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
 247  			+ MUL15(u[i + 3], v[i + 3])
 248  			+ MUL15(u[i + 4], v[i + 2]);
 249  		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
 250  			+ MUL15(u[i + 4], v[i + 3]);
 251  		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
 252  		w[(i << 1) + 9] = 0;
 253  	}
 254  
 255  	/*
 256  	 * For the 9th multiplication, source words are up to 32764,
 257  	 * so we must do some carry propagation. If we add up to
 258  	 * 4 products and the carry is no more than 524224, then the
 259  	 * result fits in 32 bits, and the next carry will be no more
 260  	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
 261  	 *
 262  	 * We thus just skip one of the products in the middle word,
 263  	 * then do a carry propagation (this reduces words to 13 bits
 264  	 * each, except possibly the last, which may use up to 17 bits
 265  	 * or so), then add the missing product.
 266  	 */
 267  	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
 268  	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
 269  		+ MUL15(u[40 + 1], v[40 + 0]);
 270  	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
 271  		+ MUL15(u[40 + 1], v[40 + 1])
 272  		+ MUL15(u[40 + 2], v[40 + 0]);
 273  	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
 274  		+ MUL15(u[40 + 1], v[40 + 2])
 275  		+ MUL15(u[40 + 2], v[40 + 1])
 276  		+ MUL15(u[40 + 3], v[40 + 0]);
 277  	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
 278  		+ MUL15(u[40 + 1], v[40 + 3])
 279  		+ MUL15(u[40 + 2], v[40 + 2])
 280  		+ MUL15(u[40 + 3], v[40 + 1]);
 281  		/* + MUL15(u[40 + 4], v[40 + 0]) */
 282  	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
 283  		+ MUL15(u[40 + 2], v[40 + 3])
 284  		+ MUL15(u[40 + 3], v[40 + 2])
 285  		+ MUL15(u[40 + 4], v[40 + 1]);
 286  	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
 287  		+ MUL15(u[40 + 3], v[40 + 3])
 288  		+ MUL15(u[40 + 4], v[40 + 2]);
 289  	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
 290  		+ MUL15(u[40 + 4], v[40 + 3]);
 291  	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
 292  
 293  	CPR(w, 80);
 294  
 295  	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
 296  
 297  	/*
 298  	 * The products on 14-bit words in slots 6 and 7 yield values
 299  	 * up to 5*(16382^2) each, and we need to subtract two such
 300  	 * values from the higher word. We need the subtraction to fit
 301  	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
 302  	 * However, 10*(16382^2) does not fit. So we must perform a
 303  	 * bit of reduction here.
 304  	 */
 305  	CPR(w, 60);
 306  	CPR(w, 70);
 307  
 308  	/*
 309  	 * Recompose results.
 310  	 */
 311  
 312  	/* 0..1*0..1 into 0..3 */
 313  	ZSUB2F(w, 8, w, 0, w, 2);
 314  	ZSUB2F(w, 9, w, 1, w, 3);
 315  	ZADDT(w, 1, w, 8);
 316  	ZADDT(w, 2, w, 9);
 317  
 318  	/* 2..3*2..3 into 4..7 */
 319  	ZSUB2F(w, 10, w, 4, w, 6);
 320  	ZSUB2F(w, 11, w, 5, w, 7);
 321  	ZADDT(w, 5, w, 10);
 322  	ZADDT(w, 6, w, 11);
 323  
 324  	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
 325  	ZSUB2F(w, 16, w, 12, w, 14);
 326  	ZSUB2F(w, 17, w, 13, w, 15);
 327  	ZADDT(w, 13, w, 16);
 328  	ZADDT(w, 14, w, 17);
 329  
 330  	/* first-level recomposition */
 331  	ZSUB2F(w, 12, w, 0, w, 4);
 332  	ZSUB2F(w, 13, w, 1, w, 5);
 333  	ZSUB2F(w, 14, w, 2, w, 6);
 334  	ZSUB2F(w, 15, w, 3, w, 7);
 335  	ZADDT(w, 2, w, 12);
 336  	ZADDT(w, 3, w, 13);
 337  	ZADDT(w, 4, w, 14);
 338  	ZADDT(w, 5, w, 15);
 339  
 340  	/*
 341  	 * Perform carry propagation to bring all words down to 13 bits.
 342  	 */
 343  	cc = norm13(d, w, 40);
 344  	d[39] += (cc << 13);
 345  
 346  #undef ZADD
 347  #undef ZADDT
 348  #undef ZSUB2F
 349  #undef CPR1
 350  #undef CPR
 351  }
 352  
 353  static inline void
 354  square20(uint32_t *d, const uint32_t *a)
 355  {
 356  	mul20(d, a, a);
 357  }
 358  
 359  #else
 360  
 361  static void
 362  mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
 363  {
 364  	uint32_t t[39];
 365  
 366  	t[ 0] = MUL15(a[ 0], b[ 0]);
 367  	t[ 1] = MUL15(a[ 0], b[ 1])
 368  		+ MUL15(a[ 1], b[ 0]);
 369  	t[ 2] = MUL15(a[ 0], b[ 2])
 370  		+ MUL15(a[ 1], b[ 1])
 371  		+ MUL15(a[ 2], b[ 0]);
 372  	t[ 3] = MUL15(a[ 0], b[ 3])
 373  		+ MUL15(a[ 1], b[ 2])
 374  		+ MUL15(a[ 2], b[ 1])
 375  		+ MUL15(a[ 3], b[ 0]);
 376  	t[ 4] = MUL15(a[ 0], b[ 4])
 377  		+ MUL15(a[ 1], b[ 3])
 378  		+ MUL15(a[ 2], b[ 2])
 379  		+ MUL15(a[ 3], b[ 1])
 380  		+ MUL15(a[ 4], b[ 0]);
 381  	t[ 5] = MUL15(a[ 0], b[ 5])
 382  		+ MUL15(a[ 1], b[ 4])
 383  		+ MUL15(a[ 2], b[ 3])
 384  		+ MUL15(a[ 3], b[ 2])
 385  		+ MUL15(a[ 4], b[ 1])
 386  		+ MUL15(a[ 5], b[ 0]);
 387  	t[ 6] = MUL15(a[ 0], b[ 6])
 388  		+ MUL15(a[ 1], b[ 5])
 389  		+ MUL15(a[ 2], b[ 4])
 390  		+ MUL15(a[ 3], b[ 3])
 391  		+ MUL15(a[ 4], b[ 2])
 392  		+ MUL15(a[ 5], b[ 1])
 393  		+ MUL15(a[ 6], b[ 0]);
 394  	t[ 7] = MUL15(a[ 0], b[ 7])
 395  		+ MUL15(a[ 1], b[ 6])
 396  		+ MUL15(a[ 2], b[ 5])
 397  		+ MUL15(a[ 3], b[ 4])
 398  		+ MUL15(a[ 4], b[ 3])
 399  		+ MUL15(a[ 5], b[ 2])
 400  		+ MUL15(a[ 6], b[ 1])
 401  		+ MUL15(a[ 7], b[ 0]);
 402  	t[ 8] = MUL15(a[ 0], b[ 8])
 403  		+ MUL15(a[ 1], b[ 7])
 404  		+ MUL15(a[ 2], b[ 6])
 405  		+ MUL15(a[ 3], b[ 5])
 406  		+ MUL15(a[ 4], b[ 4])
 407  		+ MUL15(a[ 5], b[ 3])
 408  		+ MUL15(a[ 6], b[ 2])
 409  		+ MUL15(a[ 7], b[ 1])
 410  		+ MUL15(a[ 8], b[ 0]);
 411  	t[ 9] = MUL15(a[ 0], b[ 9])
 412  		+ MUL15(a[ 1], b[ 8])
 413  		+ MUL15(a[ 2], b[ 7])
 414  		+ MUL15(a[ 3], b[ 6])
 415  		+ MUL15(a[ 4], b[ 5])
 416  		+ MUL15(a[ 5], b[ 4])
 417  		+ MUL15(a[ 6], b[ 3])
 418  		+ MUL15(a[ 7], b[ 2])
 419  		+ MUL15(a[ 8], b[ 1])
 420  		+ MUL15(a[ 9], b[ 0]);
 421  	t[10] = MUL15(a[ 0], b[10])
 422  		+ MUL15(a[ 1], b[ 9])
 423  		+ MUL15(a[ 2], b[ 8])
 424  		+ MUL15(a[ 3], b[ 7])
 425  		+ MUL15(a[ 4], b[ 6])
 426  		+ MUL15(a[ 5], b[ 5])
 427  		+ MUL15(a[ 6], b[ 4])
 428  		+ MUL15(a[ 7], b[ 3])
 429  		+ MUL15(a[ 8], b[ 2])
 430  		+ MUL15(a[ 9], b[ 1])
 431  		+ MUL15(a[10], b[ 0]);
 432  	t[11] = MUL15(a[ 0], b[11])
 433  		+ MUL15(a[ 1], b[10])
 434  		+ MUL15(a[ 2], b[ 9])
 435  		+ MUL15(a[ 3], b[ 8])
 436  		+ MUL15(a[ 4], b[ 7])
 437  		+ MUL15(a[ 5], b[ 6])
 438  		+ MUL15(a[ 6], b[ 5])
 439  		+ MUL15(a[ 7], b[ 4])
 440  		+ MUL15(a[ 8], b[ 3])
 441  		+ MUL15(a[ 9], b[ 2])
 442  		+ MUL15(a[10], b[ 1])
 443  		+ MUL15(a[11], b[ 0]);
 444  	t[12] = MUL15(a[ 0], b[12])
 445  		+ MUL15(a[ 1], b[11])
 446  		+ MUL15(a[ 2], b[10])
 447  		+ MUL15(a[ 3], b[ 9])
 448  		+ MUL15(a[ 4], b[ 8])
 449  		+ MUL15(a[ 5], b[ 7])
 450  		+ MUL15(a[ 6], b[ 6])
 451  		+ MUL15(a[ 7], b[ 5])
 452  		+ MUL15(a[ 8], b[ 4])
 453  		+ MUL15(a[ 9], b[ 3])
 454  		+ MUL15(a[10], b[ 2])
 455  		+ MUL15(a[11], b[ 1])
 456  		+ MUL15(a[12], b[ 0]);
 457  	t[13] = MUL15(a[ 0], b[13])
 458  		+ MUL15(a[ 1], b[12])
 459  		+ MUL15(a[ 2], b[11])
 460  		+ MUL15(a[ 3], b[10])
 461  		+ MUL15(a[ 4], b[ 9])
 462  		+ MUL15(a[ 5], b[ 8])
 463  		+ MUL15(a[ 6], b[ 7])
 464  		+ MUL15(a[ 7], b[ 6])
 465  		+ MUL15(a[ 8], b[ 5])
 466  		+ MUL15(a[ 9], b[ 4])
 467  		+ MUL15(a[10], b[ 3])
 468  		+ MUL15(a[11], b[ 2])
 469  		+ MUL15(a[12], b[ 1])
 470  		+ MUL15(a[13], b[ 0]);
 471  	t[14] = MUL15(a[ 0], b[14])
 472  		+ MUL15(a[ 1], b[13])
 473  		+ MUL15(a[ 2], b[12])
 474  		+ MUL15(a[ 3], b[11])
 475  		+ MUL15(a[ 4], b[10])
 476  		+ MUL15(a[ 5], b[ 9])
 477  		+ MUL15(a[ 6], b[ 8])
 478  		+ MUL15(a[ 7], b[ 7])
 479  		+ MUL15(a[ 8], b[ 6])
 480  		+ MUL15(a[ 9], b[ 5])
 481  		+ MUL15(a[10], b[ 4])
 482  		+ MUL15(a[11], b[ 3])
 483  		+ MUL15(a[12], b[ 2])
 484  		+ MUL15(a[13], b[ 1])
 485  		+ MUL15(a[14], b[ 0]);
 486  	t[15] = MUL15(a[ 0], b[15])
 487  		+ MUL15(a[ 1], b[14])
 488  		+ MUL15(a[ 2], b[13])
 489  		+ MUL15(a[ 3], b[12])
 490  		+ MUL15(a[ 4], b[11])
 491  		+ MUL15(a[ 5], b[10])
 492  		+ MUL15(a[ 6], b[ 9])
 493  		+ MUL15(a[ 7], b[ 8])
 494  		+ MUL15(a[ 8], b[ 7])
 495  		+ MUL15(a[ 9], b[ 6])
 496  		+ MUL15(a[10], b[ 5])
 497  		+ MUL15(a[11], b[ 4])
 498  		+ MUL15(a[12], b[ 3])
 499  		+ MUL15(a[13], b[ 2])
 500  		+ MUL15(a[14], b[ 1])
 501  		+ MUL15(a[15], b[ 0]);
 502  	t[16] = MUL15(a[ 0], b[16])
 503  		+ MUL15(a[ 1], b[15])
 504  		+ MUL15(a[ 2], b[14])
 505  		+ MUL15(a[ 3], b[13])
 506  		+ MUL15(a[ 4], b[12])
 507  		+ MUL15(a[ 5], b[11])
 508  		+ MUL15(a[ 6], b[10])
 509  		+ MUL15(a[ 7], b[ 9])
 510  		+ MUL15(a[ 8], b[ 8])
 511  		+ MUL15(a[ 9], b[ 7])
 512  		+ MUL15(a[10], b[ 6])
 513  		+ MUL15(a[11], b[ 5])
 514  		+ MUL15(a[12], b[ 4])
 515  		+ MUL15(a[13], b[ 3])
 516  		+ MUL15(a[14], b[ 2])
 517  		+ MUL15(a[15], b[ 1])
 518  		+ MUL15(a[16], b[ 0]);
 519  	t[17] = MUL15(a[ 0], b[17])
 520  		+ MUL15(a[ 1], b[16])
 521  		+ MUL15(a[ 2], b[15])
 522  		+ MUL15(a[ 3], b[14])
 523  		+ MUL15(a[ 4], b[13])
 524  		+ MUL15(a[ 5], b[12])
 525  		+ MUL15(a[ 6], b[11])
 526  		+ MUL15(a[ 7], b[10])
 527  		+ MUL15(a[ 8], b[ 9])
 528  		+ MUL15(a[ 9], b[ 8])
 529  		+ MUL15(a[10], b[ 7])
 530  		+ MUL15(a[11], b[ 6])
 531  		+ MUL15(a[12], b[ 5])
 532  		+ MUL15(a[13], b[ 4])
 533  		+ MUL15(a[14], b[ 3])
 534  		+ MUL15(a[15], b[ 2])
 535  		+ MUL15(a[16], b[ 1])
 536  		+ MUL15(a[17], b[ 0]);
 537  	t[18] = MUL15(a[ 0], b[18])
 538  		+ MUL15(a[ 1], b[17])
 539  		+ MUL15(a[ 2], b[16])
 540  		+ MUL15(a[ 3], b[15])
 541  		+ MUL15(a[ 4], b[14])
 542  		+ MUL15(a[ 5], b[13])
 543  		+ MUL15(a[ 6], b[12])
 544  		+ MUL15(a[ 7], b[11])
 545  		+ MUL15(a[ 8], b[10])
 546  		+ MUL15(a[ 9], b[ 9])
 547  		+ MUL15(a[10], b[ 8])
 548  		+ MUL15(a[11], b[ 7])
 549  		+ MUL15(a[12], b[ 6])
 550  		+ MUL15(a[13], b[ 5])
 551  		+ MUL15(a[14], b[ 4])
 552  		+ MUL15(a[15], b[ 3])
 553  		+ MUL15(a[16], b[ 2])
 554  		+ MUL15(a[17], b[ 1])
 555  		+ MUL15(a[18], b[ 0]);
 556  	t[19] = MUL15(a[ 0], b[19])
 557  		+ MUL15(a[ 1], b[18])
 558  		+ MUL15(a[ 2], b[17])
 559  		+ MUL15(a[ 3], b[16])
 560  		+ MUL15(a[ 4], b[15])
 561  		+ MUL15(a[ 5], b[14])
 562  		+ MUL15(a[ 6], b[13])
 563  		+ MUL15(a[ 7], b[12])
 564  		+ MUL15(a[ 8], b[11])
 565  		+ MUL15(a[ 9], b[10])
 566  		+ MUL15(a[10], b[ 9])
 567  		+ MUL15(a[11], b[ 8])
 568  		+ MUL15(a[12], b[ 7])
 569  		+ MUL15(a[13], b[ 6])
 570  		+ MUL15(a[14], b[ 5])
 571  		+ MUL15(a[15], b[ 4])
 572  		+ MUL15(a[16], b[ 3])
 573  		+ MUL15(a[17], b[ 2])
 574  		+ MUL15(a[18], b[ 1])
 575  		+ MUL15(a[19], b[ 0]);
 576  	t[20] = MUL15(a[ 1], b[19])
 577  		+ MUL15(a[ 2], b[18])
 578  		+ MUL15(a[ 3], b[17])
 579  		+ MUL15(a[ 4], b[16])
 580  		+ MUL15(a[ 5], b[15])
 581  		+ MUL15(a[ 6], b[14])
 582  		+ MUL15(a[ 7], b[13])
 583  		+ MUL15(a[ 8], b[12])
 584  		+ MUL15(a[ 9], b[11])
 585  		+ MUL15(a[10], b[10])
 586  		+ MUL15(a[11], b[ 9])
 587  		+ MUL15(a[12], b[ 8])
 588  		+ MUL15(a[13], b[ 7])
 589  		+ MUL15(a[14], b[ 6])
 590  		+ MUL15(a[15], b[ 5])
 591  		+ MUL15(a[16], b[ 4])
 592  		+ MUL15(a[17], b[ 3])
 593  		+ MUL15(a[18], b[ 2])
 594  		+ MUL15(a[19], b[ 1]);
 595  	t[21] = MUL15(a[ 2], b[19])
 596  		+ MUL15(a[ 3], b[18])
 597  		+ MUL15(a[ 4], b[17])
 598  		+ MUL15(a[ 5], b[16])
 599  		+ MUL15(a[ 6], b[15])
 600  		+ MUL15(a[ 7], b[14])
 601  		+ MUL15(a[ 8], b[13])
 602  		+ MUL15(a[ 9], b[12])
 603  		+ MUL15(a[10], b[11])
 604  		+ MUL15(a[11], b[10])
 605  		+ MUL15(a[12], b[ 9])
 606  		+ MUL15(a[13], b[ 8])
 607  		+ MUL15(a[14], b[ 7])
 608  		+ MUL15(a[15], b[ 6])
 609  		+ MUL15(a[16], b[ 5])
 610  		+ MUL15(a[17], b[ 4])
 611  		+ MUL15(a[18], b[ 3])
 612  		+ MUL15(a[19], b[ 2]);
 613  	t[22] = MUL15(a[ 3], b[19])
 614  		+ MUL15(a[ 4], b[18])
 615  		+ MUL15(a[ 5], b[17])
 616  		+ MUL15(a[ 6], b[16])
 617  		+ MUL15(a[ 7], b[15])
 618  		+ MUL15(a[ 8], b[14])
 619  		+ MUL15(a[ 9], b[13])
 620  		+ MUL15(a[10], b[12])
 621  		+ MUL15(a[11], b[11])
 622  		+ MUL15(a[12], b[10])
 623  		+ MUL15(a[13], b[ 9])
 624  		+ MUL15(a[14], b[ 8])
 625  		+ MUL15(a[15], b[ 7])
 626  		+ MUL15(a[16], b[ 6])
 627  		+ MUL15(a[17], b[ 5])
 628  		+ MUL15(a[18], b[ 4])
 629  		+ MUL15(a[19], b[ 3]);
 630  	t[23] = MUL15(a[ 4], b[19])
 631  		+ MUL15(a[ 5], b[18])
 632  		+ MUL15(a[ 6], b[17])
 633  		+ MUL15(a[ 7], b[16])
 634  		+ MUL15(a[ 8], b[15])
 635  		+ MUL15(a[ 9], b[14])
 636  		+ MUL15(a[10], b[13])
 637  		+ MUL15(a[11], b[12])
 638  		+ MUL15(a[12], b[11])
 639  		+ MUL15(a[13], b[10])
 640  		+ MUL15(a[14], b[ 9])
 641  		+ MUL15(a[15], b[ 8])
 642  		+ MUL15(a[16], b[ 7])
 643  		+ MUL15(a[17], b[ 6])
 644  		+ MUL15(a[18], b[ 5])
 645  		+ MUL15(a[19], b[ 4]);
 646  	t[24] = MUL15(a[ 5], b[19])
 647  		+ MUL15(a[ 6], b[18])
 648  		+ MUL15(a[ 7], b[17])
 649  		+ MUL15(a[ 8], b[16])
 650  		+ MUL15(a[ 9], b[15])
 651  		+ MUL15(a[10], b[14])
 652  		+ MUL15(a[11], b[13])
 653  		+ MUL15(a[12], b[12])
 654  		+ MUL15(a[13], b[11])
 655  		+ MUL15(a[14], b[10])
 656  		+ MUL15(a[15], b[ 9])
 657  		+ MUL15(a[16], b[ 8])
 658  		+ MUL15(a[17], b[ 7])
 659  		+ MUL15(a[18], b[ 6])
 660  		+ MUL15(a[19], b[ 5]);
 661  	t[25] = MUL15(a[ 6], b[19])
 662  		+ MUL15(a[ 7], b[18])
 663  		+ MUL15(a[ 8], b[17])
 664  		+ MUL15(a[ 9], b[16])
 665  		+ MUL15(a[10], b[15])
 666  		+ MUL15(a[11], b[14])
 667  		+ MUL15(a[12], b[13])
 668  		+ MUL15(a[13], b[12])
 669  		+ MUL15(a[14], b[11])
 670  		+ MUL15(a[15], b[10])
 671  		+ MUL15(a[16], b[ 9])
 672  		+ MUL15(a[17], b[ 8])
 673  		+ MUL15(a[18], b[ 7])
 674  		+ MUL15(a[19], b[ 6]);
 675  	t[26] = MUL15(a[ 7], b[19])
 676  		+ MUL15(a[ 8], b[18])
 677  		+ MUL15(a[ 9], b[17])
 678  		+ MUL15(a[10], b[16])
 679  		+ MUL15(a[11], b[15])
 680  		+ MUL15(a[12], b[14])
 681  		+ MUL15(a[13], b[13])
 682  		+ MUL15(a[14], b[12])
 683  		+ MUL15(a[15], b[11])
 684  		+ MUL15(a[16], b[10])
 685  		+ MUL15(a[17], b[ 9])
 686  		+ MUL15(a[18], b[ 8])
 687  		+ MUL15(a[19], b[ 7]);
 688  	t[27] = MUL15(a[ 8], b[19])
 689  		+ MUL15(a[ 9], b[18])
 690  		+ MUL15(a[10], b[17])
 691  		+ MUL15(a[11], b[16])
 692  		+ MUL15(a[12], b[15])
 693  		+ MUL15(a[13], b[14])
 694  		+ MUL15(a[14], b[13])
 695  		+ MUL15(a[15], b[12])
 696  		+ MUL15(a[16], b[11])
 697  		+ MUL15(a[17], b[10])
 698  		+ MUL15(a[18], b[ 9])
 699  		+ MUL15(a[19], b[ 8]);
 700  	t[28] = MUL15(a[ 9], b[19])
 701  		+ MUL15(a[10], b[18])
 702  		+ MUL15(a[11], b[17])
 703  		+ MUL15(a[12], b[16])
 704  		+ MUL15(a[13], b[15])
 705  		+ MUL15(a[14], b[14])
 706  		+ MUL15(a[15], b[13])
 707  		+ MUL15(a[16], b[12])
 708  		+ MUL15(a[17], b[11])
 709  		+ MUL15(a[18], b[10])
 710  		+ MUL15(a[19], b[ 9]);
 711  	t[29] = MUL15(a[10], b[19])
 712  		+ MUL15(a[11], b[18])
 713  		+ MUL15(a[12], b[17])
 714  		+ MUL15(a[13], b[16])
 715  		+ MUL15(a[14], b[15])
 716  		+ MUL15(a[15], b[14])
 717  		+ MUL15(a[16], b[13])
 718  		+ MUL15(a[17], b[12])
 719  		+ MUL15(a[18], b[11])
 720  		+ MUL15(a[19], b[10]);
 721  	t[30] = MUL15(a[11], b[19])
 722  		+ MUL15(a[12], b[18])
 723  		+ MUL15(a[13], b[17])
 724  		+ MUL15(a[14], b[16])
 725  		+ MUL15(a[15], b[15])
 726  		+ MUL15(a[16], b[14])
 727  		+ MUL15(a[17], b[13])
 728  		+ MUL15(a[18], b[12])
 729  		+ MUL15(a[19], b[11]);
 730  	t[31] = MUL15(a[12], b[19])
 731  		+ MUL15(a[13], b[18])
 732  		+ MUL15(a[14], b[17])
 733  		+ MUL15(a[15], b[16])
 734  		+ MUL15(a[16], b[15])
 735  		+ MUL15(a[17], b[14])
 736  		+ MUL15(a[18], b[13])
 737  		+ MUL15(a[19], b[12]);
 738  	t[32] = MUL15(a[13], b[19])
 739  		+ MUL15(a[14], b[18])
 740  		+ MUL15(a[15], b[17])
 741  		+ MUL15(a[16], b[16])
 742  		+ MUL15(a[17], b[15])
 743  		+ MUL15(a[18], b[14])
 744  		+ MUL15(a[19], b[13]);
 745  	t[33] = MUL15(a[14], b[19])
 746  		+ MUL15(a[15], b[18])
 747  		+ MUL15(a[16], b[17])
 748  		+ MUL15(a[17], b[16])
 749  		+ MUL15(a[18], b[15])
 750  		+ MUL15(a[19], b[14]);
 751  	t[34] = MUL15(a[15], b[19])
 752  		+ MUL15(a[16], b[18])
 753  		+ MUL15(a[17], b[17])
 754  		+ MUL15(a[18], b[16])
 755  		+ MUL15(a[19], b[15]);
 756  	t[35] = MUL15(a[16], b[19])
 757  		+ MUL15(a[17], b[18])
 758  		+ MUL15(a[18], b[17])
 759  		+ MUL15(a[19], b[16]);
 760  	t[36] = MUL15(a[17], b[19])
 761  		+ MUL15(a[18], b[18])
 762  		+ MUL15(a[19], b[17]);
 763  	t[37] = MUL15(a[18], b[19])
 764  		+ MUL15(a[19], b[18]);
 765  	t[38] = MUL15(a[19], b[19]);
 766  	d[39] = norm13(d, t, 39);
 767  }
 768  
 769  static void
 770  square20(uint32_t *d, const uint32_t *a)
 771  {
 772  	uint32_t t[39];
 773  
 774  	t[ 0] = MUL15(a[ 0], a[ 0]);
 775  	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
 776  	t[ 2] = MUL15(a[ 1], a[ 1])
 777  		+ ((MUL15(a[ 0], a[ 2])) << 1);
 778  	t[ 3] = ((MUL15(a[ 0], a[ 3])
 779  		+ MUL15(a[ 1], a[ 2])) << 1);
 780  	t[ 4] = MUL15(a[ 2], a[ 2])
 781  		+ ((MUL15(a[ 0], a[ 4])
 782  		+ MUL15(a[ 1], a[ 3])) << 1);
 783  	t[ 5] = ((MUL15(a[ 0], a[ 5])
 784  		+ MUL15(a[ 1], a[ 4])
 785  		+ MUL15(a[ 2], a[ 3])) << 1);
 786  	t[ 6] = MUL15(a[ 3], a[ 3])
 787  		+ ((MUL15(a[ 0], a[ 6])
 788  		+ MUL15(a[ 1], a[ 5])
 789  		+ MUL15(a[ 2], a[ 4])) << 1);
 790  	t[ 7] = ((MUL15(a[ 0], a[ 7])
 791  		+ MUL15(a[ 1], a[ 6])
 792  		+ MUL15(a[ 2], a[ 5])
 793  		+ MUL15(a[ 3], a[ 4])) << 1);
 794  	t[ 8] = MUL15(a[ 4], a[ 4])
 795  		+ ((MUL15(a[ 0], a[ 8])
 796  		+ MUL15(a[ 1], a[ 7])
 797  		+ MUL15(a[ 2], a[ 6])
 798  		+ MUL15(a[ 3], a[ 5])) << 1);
 799  	t[ 9] = ((MUL15(a[ 0], a[ 9])
 800  		+ MUL15(a[ 1], a[ 8])
 801  		+ MUL15(a[ 2], a[ 7])
 802  		+ MUL15(a[ 3], a[ 6])
 803  		+ MUL15(a[ 4], a[ 5])) << 1);
 804  	t[10] = MUL15(a[ 5], a[ 5])
 805  		+ ((MUL15(a[ 0], a[10])
 806  		+ MUL15(a[ 1], a[ 9])
 807  		+ MUL15(a[ 2], a[ 8])
 808  		+ MUL15(a[ 3], a[ 7])
 809  		+ MUL15(a[ 4], a[ 6])) << 1);
 810  	t[11] = ((MUL15(a[ 0], a[11])
 811  		+ MUL15(a[ 1], a[10])
 812  		+ MUL15(a[ 2], a[ 9])
 813  		+ MUL15(a[ 3], a[ 8])
 814  		+ MUL15(a[ 4], a[ 7])
 815  		+ MUL15(a[ 5], a[ 6])) << 1);
 816  	t[12] = MUL15(a[ 6], a[ 6])
 817  		+ ((MUL15(a[ 0], a[12])
 818  		+ MUL15(a[ 1], a[11])
 819  		+ MUL15(a[ 2], a[10])
 820  		+ MUL15(a[ 3], a[ 9])
 821  		+ MUL15(a[ 4], a[ 8])
 822  		+ MUL15(a[ 5], a[ 7])) << 1);
 823  	t[13] = ((MUL15(a[ 0], a[13])
 824  		+ MUL15(a[ 1], a[12])
 825  		+ MUL15(a[ 2], a[11])
 826  		+ MUL15(a[ 3], a[10])
 827  		+ MUL15(a[ 4], a[ 9])
 828  		+ MUL15(a[ 5], a[ 8])
 829  		+ MUL15(a[ 6], a[ 7])) << 1);
 830  	t[14] = MUL15(a[ 7], a[ 7])
 831  		+ ((MUL15(a[ 0], a[14])
 832  		+ MUL15(a[ 1], a[13])
 833  		+ MUL15(a[ 2], a[12])
 834  		+ MUL15(a[ 3], a[11])
 835  		+ MUL15(a[ 4], a[10])
 836  		+ MUL15(a[ 5], a[ 9])
 837  		+ MUL15(a[ 6], a[ 8])) << 1);
 838  	t[15] = ((MUL15(a[ 0], a[15])
 839  		+ MUL15(a[ 1], a[14])
 840  		+ MUL15(a[ 2], a[13])
 841  		+ MUL15(a[ 3], a[12])
 842  		+ MUL15(a[ 4], a[11])
 843  		+ MUL15(a[ 5], a[10])
 844  		+ MUL15(a[ 6], a[ 9])
 845  		+ MUL15(a[ 7], a[ 8])) << 1);
 846  	t[16] = MUL15(a[ 8], a[ 8])
 847  		+ ((MUL15(a[ 0], a[16])
 848  		+ MUL15(a[ 1], a[15])
 849  		+ MUL15(a[ 2], a[14])
 850  		+ MUL15(a[ 3], a[13])
 851  		+ MUL15(a[ 4], a[12])
 852  		+ MUL15(a[ 5], a[11])
 853  		+ MUL15(a[ 6], a[10])
 854  		+ MUL15(a[ 7], a[ 9])) << 1);
 855  	t[17] = ((MUL15(a[ 0], a[17])
 856  		+ MUL15(a[ 1], a[16])
 857  		+ MUL15(a[ 2], a[15])
 858  		+ MUL15(a[ 3], a[14])
 859  		+ MUL15(a[ 4], a[13])
 860  		+ MUL15(a[ 5], a[12])
 861  		+ MUL15(a[ 6], a[11])
 862  		+ MUL15(a[ 7], a[10])
 863  		+ MUL15(a[ 8], a[ 9])) << 1);
 864  	t[18] = MUL15(a[ 9], a[ 9])
 865  		+ ((MUL15(a[ 0], a[18])
 866  		+ MUL15(a[ 1], a[17])
 867  		+ MUL15(a[ 2], a[16])
 868  		+ MUL15(a[ 3], a[15])
 869  		+ MUL15(a[ 4], a[14])
 870  		+ MUL15(a[ 5], a[13])
 871  		+ MUL15(a[ 6], a[12])
 872  		+ MUL15(a[ 7], a[11])
 873  		+ MUL15(a[ 8], a[10])) << 1);
 874  	t[19] = ((MUL15(a[ 0], a[19])
 875  		+ MUL15(a[ 1], a[18])
 876  		+ MUL15(a[ 2], a[17])
 877  		+ MUL15(a[ 3], a[16])
 878  		+ MUL15(a[ 4], a[15])
 879  		+ MUL15(a[ 5], a[14])
 880  		+ MUL15(a[ 6], a[13])
 881  		+ MUL15(a[ 7], a[12])
 882  		+ MUL15(a[ 8], a[11])
 883  		+ MUL15(a[ 9], a[10])) << 1);
 884  	t[20] = MUL15(a[10], a[10])
 885  		+ ((MUL15(a[ 1], a[19])
 886  		+ MUL15(a[ 2], a[18])
 887  		+ MUL15(a[ 3], a[17])
 888  		+ MUL15(a[ 4], a[16])
 889  		+ MUL15(a[ 5], a[15])
 890  		+ MUL15(a[ 6], a[14])
 891  		+ MUL15(a[ 7], a[13])
 892  		+ MUL15(a[ 8], a[12])
 893  		+ MUL15(a[ 9], a[11])) << 1);
 894  	t[21] = ((MUL15(a[ 2], a[19])
 895  		+ MUL15(a[ 3], a[18])
 896  		+ MUL15(a[ 4], a[17])
 897  		+ MUL15(a[ 5], a[16])
 898  		+ MUL15(a[ 6], a[15])
 899  		+ MUL15(a[ 7], a[14])
 900  		+ MUL15(a[ 8], a[13])
 901  		+ MUL15(a[ 9], a[12])
 902  		+ MUL15(a[10], a[11])) << 1);
 903  	t[22] = MUL15(a[11], a[11])
 904  		+ ((MUL15(a[ 3], a[19])
 905  		+ MUL15(a[ 4], a[18])
 906  		+ MUL15(a[ 5], a[17])
 907  		+ MUL15(a[ 6], a[16])
 908  		+ MUL15(a[ 7], a[15])
 909  		+ MUL15(a[ 8], a[14])
 910  		+ MUL15(a[ 9], a[13])
 911  		+ MUL15(a[10], a[12])) << 1);
 912  	t[23] = ((MUL15(a[ 4], a[19])
 913  		+ MUL15(a[ 5], a[18])
 914  		+ MUL15(a[ 6], a[17])
 915  		+ MUL15(a[ 7], a[16])
 916  		+ MUL15(a[ 8], a[15])
 917  		+ MUL15(a[ 9], a[14])
 918  		+ MUL15(a[10], a[13])
 919  		+ MUL15(a[11], a[12])) << 1);
 920  	t[24] = MUL15(a[12], a[12])
 921  		+ ((MUL15(a[ 5], a[19])
 922  		+ MUL15(a[ 6], a[18])
 923  		+ MUL15(a[ 7], a[17])
 924  		+ MUL15(a[ 8], a[16])
 925  		+ MUL15(a[ 9], a[15])
 926  		+ MUL15(a[10], a[14])
 927  		+ MUL15(a[11], a[13])) << 1);
 928  	t[25] = ((MUL15(a[ 6], a[19])
 929  		+ MUL15(a[ 7], a[18])
 930  		+ MUL15(a[ 8], a[17])
 931  		+ MUL15(a[ 9], a[16])
 932  		+ MUL15(a[10], a[15])
 933  		+ MUL15(a[11], a[14])
 934  		+ MUL15(a[12], a[13])) << 1);
 935  	t[26] = MUL15(a[13], a[13])
 936  		+ ((MUL15(a[ 7], a[19])
 937  		+ MUL15(a[ 8], a[18])
 938  		+ MUL15(a[ 9], a[17])
 939  		+ MUL15(a[10], a[16])
 940  		+ MUL15(a[11], a[15])
 941  		+ MUL15(a[12], a[14])) << 1);
 942  	t[27] = ((MUL15(a[ 8], a[19])
 943  		+ MUL15(a[ 9], a[18])
 944  		+ MUL15(a[10], a[17])
 945  		+ MUL15(a[11], a[16])
 946  		+ MUL15(a[12], a[15])
 947  		+ MUL15(a[13], a[14])) << 1);
 948  	t[28] = MUL15(a[14], a[14])
 949  		+ ((MUL15(a[ 9], a[19])
 950  		+ MUL15(a[10], a[18])
 951  		+ MUL15(a[11], a[17])
 952  		+ MUL15(a[12], a[16])
 953  		+ MUL15(a[13], a[15])) << 1);
 954  	t[29] = ((MUL15(a[10], a[19])
 955  		+ MUL15(a[11], a[18])
 956  		+ MUL15(a[12], a[17])
 957  		+ MUL15(a[13], a[16])
 958  		+ MUL15(a[14], a[15])) << 1);
 959  	t[30] = MUL15(a[15], a[15])
 960  		+ ((MUL15(a[11], a[19])
 961  		+ MUL15(a[12], a[18])
 962  		+ MUL15(a[13], a[17])
 963  		+ MUL15(a[14], a[16])) << 1);
 964  	t[31] = ((MUL15(a[12], a[19])
 965  		+ MUL15(a[13], a[18])
 966  		+ MUL15(a[14], a[17])
 967  		+ MUL15(a[15], a[16])) << 1);
 968  	t[32] = MUL15(a[16], a[16])
 969  		+ ((MUL15(a[13], a[19])
 970  		+ MUL15(a[14], a[18])
 971  		+ MUL15(a[15], a[17])) << 1);
 972  	t[33] = ((MUL15(a[14], a[19])
 973  		+ MUL15(a[15], a[18])
 974  		+ MUL15(a[16], a[17])) << 1);
 975  	t[34] = MUL15(a[17], a[17])
 976  		+ ((MUL15(a[15], a[19])
 977  		+ MUL15(a[16], a[18])) << 1);
 978  	t[35] = ((MUL15(a[16], a[19])
 979  		+ MUL15(a[17], a[18])) << 1);
 980  	t[36] = MUL15(a[18], a[18])
 981  		+ ((MUL15(a[17], a[19])) << 1);
 982  	t[37] = ((MUL15(a[18], a[19])) << 1);
 983  	t[38] = MUL15(a[19], a[19]);
 984  	d[39] = norm13(d, t, 39);
 985  }
 986  
 987  #endif
 988  
 989  /*
 990   * Modulus for field F256 (field for point coordinates in curve P-256).
 991   */
 992  static const uint32_t F256[] = {
 993  	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
 994  	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
 995  	0x0000, 0x1FF8, 0x1FFF, 0x01FF
 996  };
 997  
 998  /*
 999   * The 'b' curve equation coefficient for P-256.
1000   */
1001  static const uint32_t P256_B[] = {
1002  	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003  	0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004  	0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005  };
1006  
1007  /*
1008   * Perform a "short reduction" in field F256 (field for curve P-256).
1009   * The source value should be less than 262 bits; on output, it will
1010   * be at most 257 bits, and less than twice the modulus.
1011   */
1012  static void
1013  reduce_f256(uint32_t *d)
1014  {
1015  	uint32_t x;
1016  
1017  	x = d[19] >> 9;
1018  	d[19] &= 0x01FF;
1019  	d[17] += x << 3;
1020  	d[14] -= x << 10;
1021  	d[7] -= x << 5;
1022  	d[0] += x;
1023  	norm13(d, d, 20);
1024  }
1025  
1026  /*
1027   * Perform a "final reduction" in field F256 (field for curve P-256).
1028   * The source value must be less than twice the modulus. If the value
1029   * is not lower than the modulus, then the modulus is subtracted and
1030   * this function returns 1; otherwise, it leaves it untouched and it
1031   * returns 0.
1032   */
1033  static uint32_t
1034  reduce_final_f256(uint32_t *d)
1035  {
1036  	uint32_t t[20];
1037  	uint32_t cc;
1038  	int i;
1039  
1040  	memcpy(t, d, sizeof t);
1041  	cc = 0;
1042  	for (i = 0; i < 20; i ++) {
1043  		uint32_t w;
1044  
1045  		w = t[i] - F256[i] - cc;
1046  		cc = w >> 31;
1047  		t[i] = w & 0x1FFF;
1048  	}
1049  	cc ^= 1;
1050  	CCOPY(cc, d, t, sizeof t);
1051  	return cc;
1052  }
1053  
1054  /*
1055   * Perform a multiplication of two integers modulo
1056   * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057   * of 20 words, each containing 13 bits of data, in little-endian order.
1058   * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059   * on output, value fits on 257 bits and is lower than twice the modulus.
1060   */
1061  static void
1062  mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063  {
1064  	uint32_t t[40], cc;
1065  	int i;
1066  
1067  	/*
1068  	 * Compute raw multiplication. All result words fit in 13 bits
1069  	 * each.
1070  	 */
1071  	mul20(t, a, b);
1072  
1073  	/*
1074  	 * Modular reduction: each high word in added/subtracted where
1075  	 * necessary.
1076  	 *
1077  	 * The modulus is:
1078  	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079  	 * Therefore:
1080  	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081  	 *
1082  	 * For a word x at bit offset n (n >= 256), we have:
1083  	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084  	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1085  	 *
1086  	 * Thus, we can nullify the high word if we reinject it at some
1087  	 * proper emplacements.
1088  	 */
1089  	for (i = 39; i >= 20; i --) {
1090  		uint32_t x;
1091  
1092  		x = t[i];
1093  		t[i - 2] += ARSH(x, 6);
1094  		t[i - 3] += (x << 7) & 0x1FFF;
1095  		t[i - 4] -= ARSH(x, 12);
1096  		t[i - 5] -= (x << 1) & 0x1FFF;
1097  		t[i - 12] -= ARSH(x, 4);
1098  		t[i - 13] -= (x << 9) & 0x1FFF;
1099  		t[i - 19] += ARSH(x, 9);
1100  		t[i - 20] += (x << 4) & 0x1FFF;
1101  	}
1102  
1103  	/*
1104  	 * Propagate carries. This is a signed propagation, and the
1105  	 * result may be negative. The loop above may enlarge values,
1106  	 * but not two much: worst case is the chain involving t[i - 3],
1107  	 * in which a value may be added to itself up to 7 times. Since
1108  	 * starting values are 13-bit each, all words fit on 20 bits
1109  	 * (21 to account for the sign bit).
1110  	 */
1111  	cc = norm13(t, t, 20);
1112  
1113  	/*
1114  	 * Perform modular reduction again for the bits beyond 256 (the carry
1115  	 * and the bits 256..259). Since the largest shift below is by 10
1116  	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117  	 * thereby allowing injecting full word values.
1118  	 */
1119  	cc = (cc << 4) | (t[19] >> 9);
1120  	t[19] &= 0x01FF;
1121  	t[17] += cc << 3;
1122  	t[14] -= cc << 10;
1123  	t[7] -= cc << 5;
1124  	t[0] += cc;
1125  
1126  	/*
1127  	 * If the carry is negative, then after carry propagation, we may
1128  	 * end up with a value which is negative, and we don't want that.
1129  	 * Thus, in that case, we add the modulus. Note that the subtraction
1130  	 * result, when the carry is negative, is always smaller than the
1131  	 * modulus, so the extra addition will not make the value exceed
1132  	 * twice the modulus.
1133  	 */
1134  	cc >>= 31;
1135  	t[0] -= cc;
1136  	t[7] += cc << 5;
1137  	t[14] += cc << 10;
1138  	t[17] -= cc << 3;
1139  	t[19] += cc << 9;
1140  
1141  	norm13(d, t, 20);
1142  }
1143  
1144  /*
1145   * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146   * P-256). Operand is an array of 20 words, each containing 13 bits of
1147   * data, in little-endian order. On input, upper word may be up to 13
1148   * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1149   * and is lower than twice the modulus.
1150   */
1151  static void
1152  square_f256(uint32_t *d, const uint32_t *a)
1153  {
1154  	uint32_t t[40], cc;
1155  	int i;
1156  
1157  	/*
1158  	 * Compute raw square. All result words fit in 13 bits each.
1159  	 */
1160  	square20(t, a);
1161  
1162  	/*
1163  	 * Modular reduction: each high word in added/subtracted where
1164  	 * necessary.
1165  	 *
1166  	 * The modulus is:
1167  	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1168  	 * Therefore:
1169  	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1170  	 *
1171  	 * For a word x at bit offset n (n >= 256), we have:
1172  	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1173  	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1174  	 *
1175  	 * Thus, we can nullify the high word if we reinject it at some
1176  	 * proper emplacements.
1177  	 */
1178  	for (i = 39; i >= 20; i --) {
1179  		uint32_t x;
1180  
1181  		x = t[i];
1182  		t[i - 2] += ARSH(x, 6);
1183  		t[i - 3] += (x << 7) & 0x1FFF;
1184  		t[i - 4] -= ARSH(x, 12);
1185  		t[i - 5] -= (x << 1) & 0x1FFF;
1186  		t[i - 12] -= ARSH(x, 4);
1187  		t[i - 13] -= (x << 9) & 0x1FFF;
1188  		t[i - 19] += ARSH(x, 9);
1189  		t[i - 20] += (x << 4) & 0x1FFF;
1190  	}
1191  
1192  	/*
1193  	 * Propagate carries. This is a signed propagation, and the
1194  	 * result may be negative. The loop above may enlarge values,
1195  	 * but not two much: worst case is the chain involving t[i - 3],
1196  	 * in which a value may be added to itself up to 7 times. Since
1197  	 * starting values are 13-bit each, all words fit on 20 bits
1198  	 * (21 to account for the sign bit).
1199  	 */
1200  	cc = norm13(t, t, 20);
1201  
1202  	/*
1203  	 * Perform modular reduction again for the bits beyond 256 (the carry
1204  	 * and the bits 256..259). Since the largest shift below is by 10
1205  	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1206  	 * thereby allowing injecting full word values.
1207  	 */
1208  	cc = (cc << 4) | (t[19] >> 9);
1209  	t[19] &= 0x01FF;
1210  	t[17] += cc << 3;
1211  	t[14] -= cc << 10;
1212  	t[7] -= cc << 5;
1213  	t[0] += cc;
1214  
1215  	/*
1216  	 * If the carry is negative, then after carry propagation, we may
1217  	 * end up with a value which is negative, and we don't want that.
1218  	 * Thus, in that case, we add the modulus. Note that the subtraction
1219  	 * result, when the carry is negative, is always smaller than the
1220  	 * modulus, so the extra addition will not make the value exceed
1221  	 * twice the modulus.
1222  	 */
1223  	cc >>= 31;
1224  	t[0] -= cc;
1225  	t[7] += cc << 5;
1226  	t[14] += cc << 10;
1227  	t[17] -= cc << 3;
1228  	t[19] += cc << 9;
1229  
1230  	norm13(d, t, 20);
1231  }
1232  
1233  /*
1234   * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1235   * are such that:
1236   *   X = x / z^2
1237   *   Y = y / z^3
1238   * For the point at infinity, z = 0.
1239   * Each point thus admits many possible representations.
1240   *
1241   * Coordinates are represented in arrays of 32-bit integers, each holding
1242   * 13 bits of data. Values may also be slightly greater than the modulus,
1243   * but they will always be lower than twice the modulus.
1244   */
1245  typedef struct {
1246  	uint32_t x[20];
1247  	uint32_t y[20];
1248  	uint32_t z[20];
1249  } p256_jacobian;
1250  
1251  /*
1252   * Convert a point to affine coordinates:
1253   *  - If the point is the point at infinity, then all three coordinates
1254   *    are set to 0.
1255   *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1256   *    coordinates are the 'X' and 'Y' affine coordinates.
1257   * The coordinates are guaranteed to be lower than the modulus.
1258   */
1259  static void
1260  p256_to_affine(p256_jacobian *P)
1261  {
1262  	uint32_t t1[20], t2[20];
1263  	int i;
1264  
1265  	/*
1266  	 * Invert z with a modular exponentiation: the modulus is
1267  	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1268  	 * p-2. Exponent bit pattern (from high to low) is:
1269  	 *  - 32 bits of value 1
1270  	 *  - 31 bits of value 0
1271  	 *  - 1 bit of value 1
1272  	 *  - 96 bits of value 0
1273  	 *  - 94 bits of value 1
1274  	 *  - 1 bit of value 0
1275  	 *  - 1 bit of value 1
1276  	 * Thus, we precompute z^(2^31-1) to speed things up.
1277  	 *
1278  	 * If z = 0 (point at infinity) then the modular exponentiation
1279  	 * will yield 0, which leads to the expected result (all three
1280  	 * coordinates set to 0).
1281  	 */
1282  
1283  	/*
1284  	 * A simple square-and-multiply for z^(2^31-1). We could save about
1285  	 * two dozen multiplications here with an addition chain, but
1286  	 * this would require a bit more code, and extra stack buffers.
1287  	 */
1288  	memcpy(t1, P->z, sizeof P->z);
1289  	for (i = 0; i < 30; i ++) {
1290  		square_f256(t1, t1);
1291  		mul_f256(t1, t1, P->z);
1292  	}
1293  
1294  	/*
1295  	 * Square-and-multiply. Apart from the squarings, we have a few
1296  	 * multiplications to set bits to 1; we multiply by the original z
1297  	 * for setting 1 bit, and by t1 for setting 31 bits.
1298  	 */
1299  	memcpy(t2, P->z, sizeof P->z);
1300  	for (i = 1; i < 256; i ++) {
1301  		square_f256(t2, t2);
1302  		switch (i) {
1303  		case 31:
1304  		case 190:
1305  		case 221:
1306  		case 252:
1307  			mul_f256(t2, t2, t1);
1308  			break;
1309  		case 63:
1310  		case 253:
1311  		case 255:
1312  			mul_f256(t2, t2, P->z);
1313  			break;
1314  		}
1315  	}
1316  
1317  	/*
1318  	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1319  	 */
1320  	mul_f256(t1, t2, t2);
1321  	mul_f256(P->x, t1, P->x);
1322  	mul_f256(t1, t1, t2);
1323  	mul_f256(P->y, t1, P->y);
1324  	reduce_final_f256(P->x);
1325  	reduce_final_f256(P->y);
1326  
1327  	/*
1328  	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1329  	 * this will set z to 1.
1330  	 */
1331  	mul_f256(P->z, P->z, t2);
1332  	reduce_final_f256(P->z);
1333  }
1334  
1335  /*
1336   * Double a point in P-256. This function works for all valid points,
1337   * including the point at infinity.
1338   */
1339  static void
1340  p256_double(p256_jacobian *Q)
1341  {
1342  	/*
1343  	 * Doubling formulas are:
1344  	 *
1345  	 *   s = 4*x*y^2
1346  	 *   m = 3*(x + z^2)*(x - z^2)
1347  	 *   x' = m^2 - 2*s
1348  	 *   y' = m*(s - x') - 8*y^4
1349  	 *   z' = 2*y*z
1350  	 *
1351  	 * These formulas work for all points, including points of order 2
1352  	 * and points at infinity:
1353  	 *   - If y = 0 then z' = 0. But there is no such point in P-256
1354  	 *     anyway.
1355  	 *   - If z = 0 then z' = 0.
1356  	 */
1357  	uint32_t t1[20], t2[20], t3[20], t4[20];
1358  	int i;
1359  
1360  	/*
1361  	 * Compute z^2 in t1.
1362  	 */
1363  	square_f256(t1, Q->z);
1364  
1365  	/*
1366  	 * Compute x-z^2 in t2 and x+z^2 in t1.
1367  	 */
1368  	for (i = 0; i < 20; i ++) {
1369  		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1370  		t1[i] += Q->x[i];
1371  	}
1372  	norm13(t1, t1, 20);
1373  	norm13(t2, t2, 20);
1374  
1375  	/*
1376  	 * Compute 3*(x+z^2)*(x-z^2) in t1.
1377  	 */
1378  	mul_f256(t3, t1, t2);
1379  	for (i = 0; i < 20; i ++) {
1380  		t1[i] = MUL15(3, t3[i]);
1381  	}
1382  	norm13(t1, t1, 20);
1383  
1384  	/*
1385  	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1386  	 */
1387  	square_f256(t3, Q->y);
1388  	for (i = 0; i < 20; i ++) {
1389  		t3[i] <<= 1;
1390  	}
1391  	norm13(t3, t3, 20);
1392  	mul_f256(t2, Q->x, t3);
1393  	for (i = 0; i < 20; i ++) {
1394  		t2[i] <<= 1;
1395  	}
1396  	norm13(t2, t2, 20);
1397  	reduce_f256(t2);
1398  
1399  	/*
1400  	 * Compute x' = m^2 - 2*s.
1401  	 */
1402  	square_f256(Q->x, t1);
1403  	for (i = 0; i < 20; i ++) {
1404  		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1405  	}
1406  	norm13(Q->x, Q->x, 20);
1407  	reduce_f256(Q->x);
1408  
1409  	/*
1410  	 * Compute z' = 2*y*z.
1411  	 */
1412  	mul_f256(t4, Q->y, Q->z);
1413  	for (i = 0; i < 20; i ++) {
1414  		Q->z[i] = t4[i] << 1;
1415  	}
1416  	norm13(Q->z, Q->z, 20);
1417  	reduce_f256(Q->z);
1418  
1419  	/*
1420  	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1421  	 * 2*y^2 in t3.
1422  	 */
1423  	for (i = 0; i < 20; i ++) {
1424  		t2[i] += (F256[i] << 1) - Q->x[i];
1425  	}
1426  	norm13(t2, t2, 20);
1427  	mul_f256(Q->y, t1, t2);
1428  	square_f256(t4, t3);
1429  	for (i = 0; i < 20; i ++) {
1430  		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1431  	}
1432  	norm13(Q->y, Q->y, 20);
1433  	reduce_f256(Q->y);
1434  }
1435  
1436  /*
1437   * Add point P2 to point P1.
1438   *
1439   * This function computes the wrong result in the following cases:
1440   *
1441   *   - If P1 == 0 but P2 != 0
1442   *   - If P1 != 0 but P2 == 0
1443   *   - If P1 == P2
1444   *
1445   * In all three cases, P1 is set to the point at infinity.
1446   *
1447   * Returned value is 0 if one of the following occurs:
1448   *
1449   *   - P1 and P2 have the same Y coordinate
1450   *   - P1 == 0 and P2 == 0
1451   *   - The Y coordinate of one of the points is 0 and the other point is
1452   *     the point at infinity.
1453   *
1454   * The third case cannot actually happen with valid points, since a point
1455   * with Y == 0 is a point of order 2, and there is no point of order 2 on
1456   * curve P-256.
1457   *
1458   * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1459   * can apply the following:
1460   *
1461   *   - If the result is not the point at infinity, then it is correct.
1462   *   - Otherwise, if the returned value is 1, then this is a case of
1463   *     P1+P2 == 0, so the result is indeed the point at infinity.
1464   *   - Otherwise, P1 == P2, so a "double" operation should have been
1465   *     performed.
1466   */
1467  static uint32_t
1468  p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1469  {
1470  	/*
1471  	 * Addtions formulas are:
1472  	 *
1473  	 *   u1 = x1 * z2^2
1474  	 *   u2 = x2 * z1^2
1475  	 *   s1 = y1 * z2^3
1476  	 *   s2 = y2 * z1^3
1477  	 *   h = u2 - u1
1478  	 *   r = s2 - s1
1479  	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1480  	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1481  	 *   z3 = h * z1 * z2
1482  	 */
1483  	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1484  	uint32_t ret;
1485  	int i;
1486  
1487  	/*
1488  	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1489  	 */
1490  	square_f256(t3, P2->z);
1491  	mul_f256(t1, P1->x, t3);
1492  	mul_f256(t4, P2->z, t3);
1493  	mul_f256(t3, P1->y, t4);
1494  
1495  	/*
1496  	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1497  	 */
1498  	square_f256(t4, P1->z);
1499  	mul_f256(t2, P2->x, t4);
1500  	mul_f256(t5, P1->z, t4);
1501  	mul_f256(t4, P2->y, t5);
1502  
1503  	/*
1504  	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1505  	 * We need to test whether r is zero, so we will do some extra
1506  	 * reduce.
1507  	 */
1508  	for (i = 0; i < 20; i ++) {
1509  		t2[i] += (F256[i] << 1) - t1[i];
1510  		t4[i] += (F256[i] << 1) - t3[i];
1511  	}
1512  	norm13(t2, t2, 20);
1513  	norm13(t4, t4, 20);
1514  	reduce_f256(t4);
1515  	reduce_final_f256(t4);
1516  	ret = 0;
1517  	for (i = 0; i < 20; i ++) {
1518  		ret |= t4[i];
1519  	}
1520  	ret = (ret | -ret) >> 31;
1521  
1522  	/*
1523  	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1524  	 */
1525  	square_f256(t7, t2);
1526  	mul_f256(t6, t1, t7);
1527  	mul_f256(t5, t7, t2);
1528  
1529  	/*
1530  	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1531  	 */
1532  	square_f256(P1->x, t4);
1533  	for (i = 0; i < 20; i ++) {
1534  		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1535  	}
1536  	norm13(P1->x, P1->x, 20);
1537  	reduce_f256(P1->x);
1538  
1539  	/*
1540  	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1541  	 */
1542  	for (i = 0; i < 20; i ++) {
1543  		t6[i] += (F256[i] << 1) - P1->x[i];
1544  	}
1545  	norm13(t6, t6, 20);
1546  	mul_f256(P1->y, t4, t6);
1547  	mul_f256(t1, t5, t3);
1548  	for (i = 0; i < 20; i ++) {
1549  		P1->y[i] += (F256[i] << 1) - t1[i];
1550  	}
1551  	norm13(P1->y, P1->y, 20);
1552  	reduce_f256(P1->y);
1553  
1554  	/*
1555  	 * Compute z3 = h*z1*z2.
1556  	 */
1557  	mul_f256(t1, P1->z, P2->z);
1558  	mul_f256(P1->z, t1, t2);
1559  
1560  	return ret;
1561  }
1562  
1563  /*
1564   * Add point P2 to point P1. This is a specialised function for the
1565   * case when P2 is a non-zero point in affine coordinate.
1566   *
1567   * This function computes the wrong result in the following cases:
1568   *
1569   *   - If P1 == 0
1570   *   - If P1 == P2
1571   *
1572   * In both cases, P1 is set to the point at infinity.
1573   *
1574   * Returned value is 0 if one of the following occurs:
1575   *
1576   *   - P1 and P2 have the same Y coordinate
1577   *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1578   *
1579   * The second case cannot actually happen with valid points, since a point
1580   * with Y == 0 is a point of order 2, and there is no point of order 2 on
1581   * curve P-256.
1582   *
1583   * Therefore, assuming that P1 != 0 on input, then the caller
1584   * can apply the following:
1585   *
1586   *   - If the result is not the point at infinity, then it is correct.
1587   *   - Otherwise, if the returned value is 1, then this is a case of
1588   *     P1+P2 == 0, so the result is indeed the point at infinity.
1589   *   - Otherwise, P1 == P2, so a "double" operation should have been
1590   *     performed.
1591   */
1592  static uint32_t
1593  p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1594  {
1595  	/*
1596  	 * Addtions formulas are:
1597  	 *
1598  	 *   u1 = x1
1599  	 *   u2 = x2 * z1^2
1600  	 *   s1 = y1
1601  	 *   s2 = y2 * z1^3
1602  	 *   h = u2 - u1
1603  	 *   r = s2 - s1
1604  	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1605  	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1606  	 *   z3 = h * z1
1607  	 */
1608  	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1609  	uint32_t ret;
1610  	int i;
1611  
1612  	/*
1613  	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1614  	 */
1615  	memcpy(t1, P1->x, sizeof t1);
1616  	memcpy(t3, P1->y, sizeof t3);
1617  
1618  	/*
1619  	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1620  	 */
1621  	square_f256(t4, P1->z);
1622  	mul_f256(t2, P2->x, t4);
1623  	mul_f256(t5, P1->z, t4);
1624  	mul_f256(t4, P2->y, t5);
1625  
1626  	/*
1627  	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1628  	 * We need to test whether r is zero, so we will do some extra
1629  	 * reduce.
1630  	 */
1631  	for (i = 0; i < 20; i ++) {
1632  		t2[i] += (F256[i] << 1) - t1[i];
1633  		t4[i] += (F256[i] << 1) - t3[i];
1634  	}
1635  	norm13(t2, t2, 20);
1636  	norm13(t4, t4, 20);
1637  	reduce_f256(t4);
1638  	reduce_final_f256(t4);
1639  	ret = 0;
1640  	for (i = 0; i < 20; i ++) {
1641  		ret |= t4[i];
1642  	}
1643  	ret = (ret | -ret) >> 31;
1644  
1645  	/*
1646  	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1647  	 */
1648  	square_f256(t7, t2);
1649  	mul_f256(t6, t1, t7);
1650  	mul_f256(t5, t7, t2);
1651  
1652  	/*
1653  	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1654  	 */
1655  	square_f256(P1->x, t4);
1656  	for (i = 0; i < 20; i ++) {
1657  		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1658  	}
1659  	norm13(P1->x, P1->x, 20);
1660  	reduce_f256(P1->x);
1661  
1662  	/*
1663  	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1664  	 */
1665  	for (i = 0; i < 20; i ++) {
1666  		t6[i] += (F256[i] << 1) - P1->x[i];
1667  	}
1668  	norm13(t6, t6, 20);
1669  	mul_f256(P1->y, t4, t6);
1670  	mul_f256(t1, t5, t3);
1671  	for (i = 0; i < 20; i ++) {
1672  		P1->y[i] += (F256[i] << 1) - t1[i];
1673  	}
1674  	norm13(P1->y, P1->y, 20);
1675  	reduce_f256(P1->y);
1676  
1677  	/*
1678  	 * Compute z3 = h*z1*z2.
1679  	 */
1680  	mul_f256(P1->z, P1->z, t2);
1681  
1682  	return ret;
1683  }
1684  
1685  /*
1686   * Decode a P-256 point. This function does not support the point at
1687   * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1688   */
1689  static uint32_t
1690  p256_decode(p256_jacobian *P, const void *src, size_t len)
1691  {
1692  	const unsigned char *buf;
1693  	uint32_t tx[20], ty[20], t1[20], t2[20];
1694  	uint32_t bad;
1695  	int i;
1696  
1697  	if (len != 65) {
1698  		return 0;
1699  	}
1700  	buf = src;
1701  
1702  	/*
1703  	 * First byte must be 0x04 (uncompressed format). We could support
1704  	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1705  	 * least significant bit of the Y coordinate), but it is explicitly
1706  	 * forbidden by RFC 5480 (section 2.2).
1707  	 */
1708  	bad = NEQ(buf[0], 0x04);
1709  
1710  	/*
1711  	 * Decode the coordinates, and check that they are both lower
1712  	 * than the modulus.
1713  	 */
1714  	tx[19] = be8_to_le13(tx, buf + 1, 32);
1715  	ty[19] = be8_to_le13(ty, buf + 33, 32);
1716  	bad |= reduce_final_f256(tx);
1717  	bad |= reduce_final_f256(ty);
1718  
1719  	/*
1720  	 * Check curve equation.
1721  	 */
1722  	square_f256(t1, tx);
1723  	mul_f256(t1, tx, t1);
1724  	square_f256(t2, ty);
1725  	for (i = 0; i < 20; i ++) {
1726  		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1727  	}
1728  	norm13(t1, t1, 20);
1729  	reduce_f256(t1);
1730  	reduce_final_f256(t1);
1731  	for (i = 0; i < 20; i ++) {
1732  		bad |= t1[i];
1733  	}
1734  
1735  	/*
1736  	 * Copy coordinates to the point structure.
1737  	 */
1738  	memcpy(P->x, tx, sizeof tx);
1739  	memcpy(P->y, ty, sizeof ty);
1740  	memset(P->z, 0, sizeof P->z);
1741  	P->z[0] = 1;
1742  	return EQ(bad, 0);
1743  }
1744  
1745  /*
1746   * Encode a point into a buffer. This function assumes that the point is
1747   * valid, in affine coordinates, and not the point at infinity.
1748   */
1749  static void
1750  p256_encode(void *dst, const p256_jacobian *P)
1751  {
1752  	unsigned char *buf;
1753  
1754  	buf = dst;
1755  	buf[0] = 0x04;
1756  	le13_to_be8(buf + 1, 32, P->x);
1757  	le13_to_be8(buf + 33, 32, P->y);
1758  }
1759  
1760  /*
1761   * Multiply a curve point by an integer. The integer is assumed to be
1762   * lower than the curve order, and the base point must not be the point
1763   * at infinity.
1764   */
1765  static void
1766  p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1767  {
1768  	/*
1769  	 * qz is a flag that is initially 1, and remains equal to 1
1770  	 * as long as the point is the point at infinity.
1771  	 *
1772  	 * We use a 2-bit window to handle multiplier bits by pairs.
1773  	 * The precomputed window really is the points P2 and P3.
1774  	 */
1775  	uint32_t qz;
1776  	p256_jacobian P2, P3, Q, T, U;
1777  
1778  	/*
1779  	 * Compute window values.
1780  	 */
1781  	P2 = *P;
1782  	p256_double(&P2);
1783  	P3 = *P;
1784  	p256_add(&P3, &P2);
1785  
1786  	/*
1787  	 * We start with Q = 0. We process multiplier bits 2 by 2.
1788  	 */
1789  	memset(&Q, 0, sizeof Q);
1790  	qz = 1;
1791  	while (xlen -- > 0) {
1792  		int k;
1793  
1794  		for (k = 6; k >= 0; k -= 2) {
1795  			uint32_t bits;
1796  			uint32_t bnz;
1797  
1798  			p256_double(&Q);
1799  			p256_double(&Q);
1800  			T = *P;
1801  			U = Q;
1802  			bits = (*x >> k) & (uint32_t)3;
1803  			bnz = NEQ(bits, 0);
1804  			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1805  			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1806  			p256_add(&U, &T);
1807  			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1808  			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1809  			qz &= ~bnz;
1810  		}
1811  		x ++;
1812  	}
1813  	*P = Q;
1814  }
1815  
1816  /*
1817   * Precomputed window: k*G points, where G is the curve generator, and k
1818   * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1819   * the point are encoded as 20 words of 13 bits each (little-endian
1820   * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821   * (little-endian order within each word).
1822   */
1823  static const uint32_t Gwin[15][20] = {
1824  
1825  	{ 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1826  	  0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1827  	  0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1828  	  0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1829  	  0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1830  
1831  	{ 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1832  	  0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1833  	  0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1834  	  0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1835  	  0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1836  
1837  	{ 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1838  	  0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1839  	  0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1840  	  0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1841  	  0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1842  
1843  	{ 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1844  	  0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1845  	  0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1846  	  0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1847  	  0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1848  
1849  	{ 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1850  	  0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1851  	  0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1852  	  0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1853  	  0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1854  
1855  	{ 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1856  	  0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1857  	  0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1858  	  0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1859  	  0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1860  
1861  	{ 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1862  	  0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1863  	  0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1864  	  0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1865  	  0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1866  
1867  	{ 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1868  	  0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1869  	  0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1870  	  0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1871  	  0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1872  
1873  	{ 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1874  	  0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1875  	  0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1876  	  0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1877  	  0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1878  
1879  	{ 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1880  	  0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1881  	  0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1882  	  0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1883  	  0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1884  
1885  	{ 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1886  	  0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1887  	  0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1888  	  0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1889  	  0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1890  
1891  	{ 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1892  	  0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1893  	  0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1894  	  0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1895  	  0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1896  
1897  	{ 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1898  	  0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1899  	  0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1900  	  0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1901  	  0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1902  
1903  	{ 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1904  	  0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1905  	  0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1906  	  0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1907  	  0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1908  
1909  	{ 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1910  	  0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1911  	  0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1912  	  0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1913  	  0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1914  };
1915  
1916  /*
1917   * Lookup one of the Gwin[] values, by index. This is constant-time.
1918   */
1919  static void
1920  lookup_Gwin(p256_jacobian *T, uint32_t idx)
1921  {
1922  	uint32_t xy[20];
1923  	uint32_t k;
1924  	size_t u;
1925  
1926  	memset(xy, 0, sizeof xy);
1927  	for (k = 0; k < 15; k ++) {
1928  		uint32_t m;
1929  
1930  		m = -EQ(idx, k + 1);
1931  		for (u = 0; u < 20; u ++) {
1932  			xy[u] |= m & Gwin[k][u];
1933  		}
1934  	}
1935  	for (u = 0; u < 10; u ++) {
1936  		T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1937  		T->x[(u << 1) + 1] = xy[u] >> 16;
1938  		T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1939  		T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1940  	}
1941  	memset(T->z, 0, sizeof T->z);
1942  	T->z[0] = 1;
1943  }
1944  
1945  /*
1946   * Multiply the generator by an integer. The integer is assumed non-zero
1947   * and lower than the curve order.
1948   */
1949  static void
1950  p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1951  {
1952  	/*
1953  	 * qz is a flag that is initially 1, and remains equal to 1
1954  	 * as long as the point is the point at infinity.
1955  	 *
1956  	 * We use a 4-bit window to handle multiplier bits by groups
1957  	 * of 4. The precomputed window is constant static data, with
1958  	 * points in affine coordinates; we use a constant-time lookup.
1959  	 */
1960  	p256_jacobian Q;
1961  	uint32_t qz;
1962  
1963  	memset(&Q, 0, sizeof Q);
1964  	qz = 1;
1965  	while (xlen -- > 0) {
1966  		int k;
1967  		unsigned bx;
1968  
1969  		bx = *x ++;
1970  		for (k = 0; k < 2; k ++) {
1971  			uint32_t bits;
1972  			uint32_t bnz;
1973  			p256_jacobian T, U;
1974  
1975  			p256_double(&Q);
1976  			p256_double(&Q);
1977  			p256_double(&Q);
1978  			p256_double(&Q);
1979  			bits = (bx >> 4) & 0x0F;
1980  			bnz = NEQ(bits, 0);
1981  			lookup_Gwin(&T, bits);
1982  			U = Q;
1983  			p256_add_mixed(&U, &T);
1984  			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1985  			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1986  			qz &= ~bnz;
1987  			bx <<= 4;
1988  		}
1989  	}
1990  	*P = Q;
1991  }
1992  
1993  static const unsigned char P256_G[] = {
1994  	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1995  	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1996  	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1997  	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1998  	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1999  	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
2000  	0x68, 0x37, 0xBF, 0x51, 0xF5
2001  };
2002  
2003  static const unsigned char P256_N[] = {
2004  	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
2005  	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
2006  	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
2007  	0x25, 0x51
2008  };
2009  
2010  static const unsigned char *
2011  api_generator(int curve, size_t *len)
2012  {
2013  	(void)curve;
2014  	*len = sizeof P256_G;
2015  	return P256_G;
2016  }
2017  
2018  static const unsigned char *
2019  api_order(int curve, size_t *len)
2020  {
2021  	(void)curve;
2022  	*len = sizeof P256_N;
2023  	return P256_N;
2024  }
2025  
2026  static size_t
2027  api_xoff(int curve, size_t *len)
2028  {
2029  	(void)curve;
2030  	*len = 32;
2031  	return 1;
2032  }
2033  
2034  static uint32_t
2035  api_mul(unsigned char *G, size_t Glen,
2036  	const unsigned char *x, size_t xlen, int curve)
2037  {
2038  	uint32_t r;
2039  	p256_jacobian P;
2040  
2041  	(void)curve;
2042  	if (Glen != 65) {
2043  		return 0;
2044  	}
2045  	r = p256_decode(&P, G, Glen);
2046  	p256_mul(&P, x, xlen);
2047  	p256_to_affine(&P);
2048  	p256_encode(G, &P);
2049  	return r;
2050  }
2051  
2052  static size_t
2053  api_mulgen(unsigned char *R,
2054  	const unsigned char *x, size_t xlen, int curve)
2055  {
2056  	p256_jacobian P;
2057  
2058  	(void)curve;
2059  	p256_mulgen(&P, x, xlen);
2060  	p256_to_affine(&P);
2061  	p256_encode(R, &P);
2062  	return 65;
2063  }
2064  
2065  static uint32_t
2066  api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2067  	const unsigned char *x, size_t xlen,
2068  	const unsigned char *y, size_t ylen, int curve)
2069  {
2070  	p256_jacobian P, Q;
2071  	uint32_t r, t, z;
2072  	int i;
2073  
2074  	(void)curve;
2075  	if (len != 65) {
2076  		return 0;
2077  	}
2078  	r = p256_decode(&P, A, len);
2079  	p256_mul(&P, x, xlen);
2080  	if (B == NULL) {
2081  		p256_mulgen(&Q, y, ylen);
2082  	} else {
2083  		r &= p256_decode(&Q, B, len);
2084  		p256_mul(&Q, y, ylen);
2085  	}
2086  
2087  	/*
2088  	 * The final addition may fail in case both points are equal.
2089  	 */
2090  	t = p256_add(&P, &Q);
2091  	reduce_final_f256(P.z);
2092  	z = 0;
2093  	for (i = 0; i < 20; i ++) {
2094  		z |= P.z[i];
2095  	}
2096  	z = EQ(z, 0);
2097  	p256_double(&Q);
2098  
2099  	/*
2100  	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2101  	 * have the following:
2102  	 *
2103  	 *   z = 0, t = 0   return P (normal addition)
2104  	 *   z = 0, t = 1   return P (normal addition)
2105  	 *   z = 1, t = 0   return Q (a 'double' case)
2106  	 *   z = 1, t = 1   report an error (P+Q = 0)
2107  	 */
2108  	CCOPY(z & ~t, &P, &Q, sizeof Q);
2109  	p256_to_affine(&P);
2110  	p256_encode(A, &P);
2111  	r &= ~(z & t);
2112  	return r;
2113  }
2114  
2115  /* see bearssl_ec.h */
2116  const br_ec_impl br_ec_p256_m15 = {
2117  	(uint32_t)0x00800000,
2118  	&api_generator,
2119  	&api_order,
2120  	&api_xoff,
2121  	&api_mul,
2122  	&api_mulgen,
2123  	&api_muladd
2124  };