/ src / ec / ec_p256_m31.c
ec_p256_m31.c
   1  /*
   2   * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
   3   *
   4   * Permission is hereby granted, free of charge, to any person obtaining 
   5   * a copy of this software and associated documentation files (the
   6   * "Software"), to deal in the Software without restriction, including
   7   * without limitation the rights to use, copy, modify, merge, publish,
   8   * distribute, sublicense, and/or sell copies of the Software, and to
   9   * permit persons to whom the Software is furnished to do so, subject to
  10   * the following conditions:
  11   *
  12   * The above copyright notice and this permission notice shall be 
  13   * included in all copies or substantial portions of the Software.
  14   *
  15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
  16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
  18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  22   * SOFTWARE.
  23   */
  24  
  25  #include "inner.h"
  26  
  27  /*
  28   * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
  29   * that right-shifting a signed negative integer copies the sign bit
  30   * (arithmetic right-shift). This is "implementation-defined behaviour",
  31   * i.e. it is not undefined, but it may differ between compilers. Each
  32   * compiler is supposed to document its behaviour in that respect. GCC
  33   * explicitly defines that an arithmetic right shift is used. We expect
  34   * all other compilers to do the same, because underlying CPU offer an
  35   * arithmetic right shift opcode that could not be used otherwise.
  36   */
  37  #if BR_NO_ARITH_SHIFT
  38  #define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
  39                        | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
  40  #define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
  41                        | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
  42  #else
  43  #define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
  44  #define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
  45  #endif
  46  
  47  /*
  48   * Convert an integer from unsigned big-endian encoding to a sequence of
  49   * 30-bit words in little-endian order. The final "partial" word is
  50   * returned.
  51   */
  52  static uint32_t
  53  be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
  54  {
  55  	uint32_t acc;
  56  	int acc_len;
  57  
  58  	acc = 0;
  59  	acc_len = 0;
  60  	while (len -- > 0) {
  61  		uint32_t b;
  62  
  63  		b = src[len];
  64  		if (acc_len < 22) {
  65  			acc |= b << acc_len;
  66  			acc_len += 8;
  67  		} else {
  68  			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
  69  			acc = b >> (30 - acc_len);
  70  			acc_len -= 22;
  71  		}
  72  	}
  73  	return acc;
  74  }
  75  
  76  /*
  77   * Convert an integer (30-bit words, little-endian) to unsigned
  78   * big-endian encoding. The total encoding length is provided; all
  79   * the destination bytes will be filled.
  80   */
  81  static void
  82  le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
  83  {
  84  	uint32_t acc;
  85  	int acc_len;
  86  
  87  	acc = 0;
  88  	acc_len = 0;
  89  	while (len -- > 0) {
  90  		if (acc_len < 8) {
  91  			uint32_t w;
  92  
  93  			w = *src ++;
  94  			dst[len] = (unsigned char)(acc | (w << acc_len));
  95  			acc = w >> (8 - acc_len);
  96  			acc_len += 22;
  97  		} else {
  98  			dst[len] = (unsigned char)acc;
  99  			acc >>= 8;
 100  			acc_len -= 8;
 101  		}
 102  	}
 103  }
 104  
 105  /*
 106   * Multiply two integers. Source integers are represented as arrays of
 107   * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 108   * 18 words of 30 bits each.
 109   */
 110  static void
 111  mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
 112  {
 113  	/*
 114  	 * Maximum intermediate result is no more than
 115  	 * 10376293531797946367, which fits in 64 bits. Reason:
 116  	 *
 117  	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
 118  	 *   10376293531797946367 < 9663676407 * 2^30
 119  	 *
 120  	 * Thus, adding together 9 products of 30-bit integers, with
 121  	 * a carry of at most 9663676406, yields an integer that fits
 122  	 * on 64 bits and generates a carry of at most 9663676406.
 123  	 */
 124  	uint64_t t[17];
 125  	uint64_t cc;
 126  	int i;
 127  
 128  	t[ 0] = MUL31(a[0], b[0]);
 129  	t[ 1] = MUL31(a[0], b[1])
 130  		+ MUL31(a[1], b[0]);
 131  	t[ 2] = MUL31(a[0], b[2])
 132  		+ MUL31(a[1], b[1])
 133  		+ MUL31(a[2], b[0]);
 134  	t[ 3] = MUL31(a[0], b[3])
 135  		+ MUL31(a[1], b[2])
 136  		+ MUL31(a[2], b[1])
 137  		+ MUL31(a[3], b[0]);
 138  	t[ 4] = MUL31(a[0], b[4])
 139  		+ MUL31(a[1], b[3])
 140  		+ MUL31(a[2], b[2])
 141  		+ MUL31(a[3], b[1])
 142  		+ MUL31(a[4], b[0]);
 143  	t[ 5] = MUL31(a[0], b[5])
 144  		+ MUL31(a[1], b[4])
 145  		+ MUL31(a[2], b[3])
 146  		+ MUL31(a[3], b[2])
 147  		+ MUL31(a[4], b[1])
 148  		+ MUL31(a[5], b[0]);
 149  	t[ 6] = MUL31(a[0], b[6])
 150  		+ MUL31(a[1], b[5])
 151  		+ MUL31(a[2], b[4])
 152  		+ MUL31(a[3], b[3])
 153  		+ MUL31(a[4], b[2])
 154  		+ MUL31(a[5], b[1])
 155  		+ MUL31(a[6], b[0]);
 156  	t[ 7] = MUL31(a[0], b[7])
 157  		+ MUL31(a[1], b[6])
 158  		+ MUL31(a[2], b[5])
 159  		+ MUL31(a[3], b[4])
 160  		+ MUL31(a[4], b[3])
 161  		+ MUL31(a[5], b[2])
 162  		+ MUL31(a[6], b[1])
 163  		+ MUL31(a[7], b[0]);
 164  	t[ 8] = MUL31(a[0], b[8])
 165  		+ MUL31(a[1], b[7])
 166  		+ MUL31(a[2], b[6])
 167  		+ MUL31(a[3], b[5])
 168  		+ MUL31(a[4], b[4])
 169  		+ MUL31(a[5], b[3])
 170  		+ MUL31(a[6], b[2])
 171  		+ MUL31(a[7], b[1])
 172  		+ MUL31(a[8], b[0]);
 173  	t[ 9] = MUL31(a[1], b[8])
 174  		+ MUL31(a[2], b[7])
 175  		+ MUL31(a[3], b[6])
 176  		+ MUL31(a[4], b[5])
 177  		+ MUL31(a[5], b[4])
 178  		+ MUL31(a[6], b[3])
 179  		+ MUL31(a[7], b[2])
 180  		+ MUL31(a[8], b[1]);
 181  	t[10] = MUL31(a[2], b[8])
 182  		+ MUL31(a[3], b[7])
 183  		+ MUL31(a[4], b[6])
 184  		+ MUL31(a[5], b[5])
 185  		+ MUL31(a[6], b[4])
 186  		+ MUL31(a[7], b[3])
 187  		+ MUL31(a[8], b[2]);
 188  	t[11] = MUL31(a[3], b[8])
 189  		+ MUL31(a[4], b[7])
 190  		+ MUL31(a[5], b[6])
 191  		+ MUL31(a[6], b[5])
 192  		+ MUL31(a[7], b[4])
 193  		+ MUL31(a[8], b[3]);
 194  	t[12] = MUL31(a[4], b[8])
 195  		+ MUL31(a[5], b[7])
 196  		+ MUL31(a[6], b[6])
 197  		+ MUL31(a[7], b[5])
 198  		+ MUL31(a[8], b[4]);
 199  	t[13] = MUL31(a[5], b[8])
 200  		+ MUL31(a[6], b[7])
 201  		+ MUL31(a[7], b[6])
 202  		+ MUL31(a[8], b[5]);
 203  	t[14] = MUL31(a[6], b[8])
 204  		+ MUL31(a[7], b[7])
 205  		+ MUL31(a[8], b[6]);
 206  	t[15] = MUL31(a[7], b[8])
 207  		+ MUL31(a[8], b[7]);
 208  	t[16] = MUL31(a[8], b[8]);
 209  
 210  	/*
 211  	 * Propagate carries.
 212  	 */
 213  	cc = 0;
 214  	for (i = 0; i < 17; i ++) {
 215  		uint64_t w;
 216  
 217  		w = t[i] + cc;
 218  		d[i] = (uint32_t)w & 0x3FFFFFFF;
 219  		cc = w >> 30;
 220  	}
 221  	d[17] = (uint32_t)cc;
 222  }
 223  
 224  /*
 225   * Square a 270-bit integer, represented as an array of nine 30-bit words.
 226   * Result uses 18 words of 30 bits each.
 227   */
 228  static void
 229  square9(uint32_t *d, const uint32_t *a)
 230  {
 231  	uint64_t t[17];
 232  	uint64_t cc;
 233  	int i;
 234  
 235  	t[ 0] = MUL31(a[0], a[0]);
 236  	t[ 1] = ((MUL31(a[0], a[1])) << 1);
 237  	t[ 2] = MUL31(a[1], a[1])
 238  		+ ((MUL31(a[0], a[2])) << 1);
 239  	t[ 3] = ((MUL31(a[0], a[3])
 240  		+ MUL31(a[1], a[2])) << 1);
 241  	t[ 4] = MUL31(a[2], a[2])
 242  		+ ((MUL31(a[0], a[4])
 243  		+ MUL31(a[1], a[3])) << 1);
 244  	t[ 5] = ((MUL31(a[0], a[5])
 245  		+ MUL31(a[1], a[4])
 246  		+ MUL31(a[2], a[3])) << 1);
 247  	t[ 6] = MUL31(a[3], a[3])
 248  		+ ((MUL31(a[0], a[6])
 249  		+ MUL31(a[1], a[5])
 250  		+ MUL31(a[2], a[4])) << 1);
 251  	t[ 7] = ((MUL31(a[0], a[7])
 252  		+ MUL31(a[1], a[6])
 253  		+ MUL31(a[2], a[5])
 254  		+ MUL31(a[3], a[4])) << 1);
 255  	t[ 8] = MUL31(a[4], a[4])
 256  		+ ((MUL31(a[0], a[8])
 257  		+ MUL31(a[1], a[7])
 258  		+ MUL31(a[2], a[6])
 259  		+ MUL31(a[3], a[5])) << 1);
 260  	t[ 9] = ((MUL31(a[1], a[8])
 261  		+ MUL31(a[2], a[7])
 262  		+ MUL31(a[3], a[6])
 263  		+ MUL31(a[4], a[5])) << 1);
 264  	t[10] = MUL31(a[5], a[5])
 265  		+ ((MUL31(a[2], a[8])
 266  		+ MUL31(a[3], a[7])
 267  		+ MUL31(a[4], a[6])) << 1);
 268  	t[11] = ((MUL31(a[3], a[8])
 269  		+ MUL31(a[4], a[7])
 270  		+ MUL31(a[5], a[6])) << 1);
 271  	t[12] = MUL31(a[6], a[6])
 272  		+ ((MUL31(a[4], a[8])
 273  		+ MUL31(a[5], a[7])) << 1);
 274  	t[13] = ((MUL31(a[5], a[8])
 275  		+ MUL31(a[6], a[7])) << 1);
 276  	t[14] = MUL31(a[7], a[7])
 277  		+ ((MUL31(a[6], a[8])) << 1);
 278  	t[15] = ((MUL31(a[7], a[8])) << 1);
 279  	t[16] = MUL31(a[8], a[8]);
 280  
 281  	/*
 282  	 * Propagate carries.
 283  	 */
 284  	cc = 0;
 285  	for (i = 0; i < 17; i ++) {
 286  		uint64_t w;
 287  
 288  		w = t[i] + cc;
 289  		d[i] = (uint32_t)w & 0x3FFFFFFF;
 290  		cc = w >> 30;
 291  	}
 292  	d[17] = (uint32_t)cc;
 293  }
 294  
 295  /*
 296   * Base field modulus for P-256.
 297   */
 298  static const uint32_t F256[] = {
 299  
 300  	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
 301  	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
 302  };
 303  
 304  /*
 305   * The 'b' curve equation coefficient for P-256.
 306   */
 307  static const uint32_t P256_B[] = {
 308  
 309  	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
 310  	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
 311  };
 312  
 313  /*
 314   * Addition in the field. Source operands shall fit on 257 bits; output
 315   * will be lower than twice the modulus.
 316   */
 317  static void
 318  add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 319  {
 320  	uint32_t w, cc;
 321  	int i;
 322  
 323  	cc = 0;
 324  	for (i = 0; i < 9; i ++) {
 325  		w = a[i] + b[i] + cc;
 326  		d[i] = w & 0x3FFFFFFF;
 327  		cc = w >> 30;
 328  	}
 329  	w >>= 16;
 330  	d[8] &= 0xFFFF;
 331  	d[3] -= w << 6;
 332  	d[6] -= w << 12;
 333  	d[7] += w << 14;
 334  	cc = w;
 335  	for (i = 0; i < 9; i ++) {
 336  		w = d[i] + cc;
 337  		d[i] = w & 0x3FFFFFFF;
 338  		cc = ARSH(w, 30);
 339  	}
 340  }
 341  
 342  /*
 343   * Subtraction in the field. Source operands shall be smaller than twice
 344   * the modulus; the result will fulfil the same property.
 345   */
 346  static void
 347  sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 348  {
 349  	uint32_t w, cc;
 350  	int i;
 351  
 352  	/*
 353  	 * We really compute a - b + 2*p to make sure that the result is
 354  	 * positive.
 355  	 */
 356  	w = a[0] - b[0] - 0x00002;
 357  	d[0] = w & 0x3FFFFFFF;
 358  	w = a[1] - b[1] + ARSH(w, 30);
 359  	d[1] = w & 0x3FFFFFFF;
 360  	w = a[2] - b[2] + ARSH(w, 30);
 361  	d[2] = w & 0x3FFFFFFF;
 362  	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
 363  	d[3] = w & 0x3FFFFFFF;
 364  	w = a[4] - b[4] + ARSH(w, 30);
 365  	d[4] = w & 0x3FFFFFFF;
 366  	w = a[5] - b[5] + ARSH(w, 30);
 367  	d[5] = w & 0x3FFFFFFF;
 368  	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
 369  	d[6] = w & 0x3FFFFFFF;
 370  	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
 371  	d[7] = w & 0x3FFFFFFF;
 372  	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
 373  	d[8] = w & 0xFFFF;
 374  	w >>= 16;
 375  	d[8] &= 0xFFFF;
 376  	d[3] -= w << 6;
 377  	d[6] -= w << 12;
 378  	d[7] += w << 14;
 379  	cc = w;
 380  	for (i = 0; i < 9; i ++) {
 381  		w = d[i] + cc;
 382  		d[i] = w & 0x3FFFFFFF;
 383  		cc = ARSH(w, 30);
 384  	}
 385  }
 386  
 387  /*
 388   * Compute a multiplication in F256. Source operands shall be less than
 389   * twice the modulus.
 390   */
 391  static void
 392  mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
 393  {
 394  	uint32_t t[18];
 395  	uint64_t s[18];
 396  	uint64_t cc, x;
 397  	uint32_t z, c;
 398  	int i;
 399  
 400  	mul9(t, a, b);
 401  
 402  	/*
 403  	 * Modular reduction: each high word in added/subtracted where
 404  	 * necessary.
 405  	 *
 406  	 * The modulus is:
 407  	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 408  	 * Therefore:
 409  	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 410  	 *
 411  	 * For a word x at bit offset n (n >= 256), we have:
 412  	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
 413  	 *            - x*2^(n - 160) + x*2^(n-256) mod p
 414  	 *
 415  	 * Thus, we can nullify the high word if we reinject it at some
 416  	 * proper emplacements.
 417  	 *
 418  	 * We use 64-bit intermediate words to allow for carries to
 419  	 * accumulate easily, before performing the final propagation.
 420  	 */
 421  	for (i = 0; i < 18; i ++) {
 422  		s[i] = t[i];
 423  	}
 424  
 425  	for (i = 17; i >= 9; i --) {
 426  		uint64_t y;
 427  
 428  		y = s[i];
 429  		s[i - 1] += ARSHW(y, 2);
 430  		s[i - 2] += (y << 28) & 0x3FFFFFFF;
 431  		s[i - 2] -= ARSHW(y, 4);
 432  		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
 433  		s[i - 5] -= ARSHW(y, 10);
 434  		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
 435  		s[i - 8] += ARSHW(y, 16);
 436  		s[i - 9] += (y << 14) & 0x3FFFFFFF;
 437  	}
 438  
 439  	/*
 440  	 * Carry propagation must be signed. Moreover, we may have overdone
 441  	 * it a bit, and obtain a negative result.
 442  	 *
 443  	 * The loop above ran 9 times; each time, each word was augmented
 444  	 * by at most one extra word (in absolute value). Thus, the top
 445  	 * word must in fine fit in 39 bits, so the carry below will fit
 446  	 * on 9 bits.
 447  	 */
 448  	cc = 0;
 449  	for (i = 0; i < 9; i ++) {
 450  		x = s[i] + cc;
 451  		d[i] = (uint32_t)x & 0x3FFFFFFF;
 452  		cc = ARSHW(x, 30);
 453  	}
 454  
 455  	/*
 456  	 * All nine words fit on 30 bits, but there may be an extra
 457  	 * carry for a few bits (at most 9), and that carry may be
 458  	 * negative. Moreover, we want the result to fit on 257 bits.
 459  	 * The two lines below ensure that the word in d[] has length
 460  	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 461  	 * significant length of cc is less than 24 bits, so we will be
 462  	 * able to switch to 32-bit operations.
 463  	 */
 464  	cc = ARSHW(x, 16);
 465  	d[8] &= 0xFFFF;
 466  
 467  	/*
 468  	 * One extra round of reduction, for cc*2^256, which means
 469  	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
 470  	 * value. If cc is negative, then it may happen (rarely, but
 471  	 * not neglectibly so) that the result would be negative. In
 472  	 * order to avoid that, if cc is negative, then we add the
 473  	 * modulus once. Note that if cc is negative, then propagating
 474  	 * that carry must yield a value lower than the modulus, so
 475  	 * adding the modulus once will keep the final result under
 476  	 * twice the modulus.
 477  	 */
 478  	z = (uint32_t)cc;
 479  	d[3] -= z << 6;
 480  	d[6] -= (z << 12) & 0x3FFFFFFF;
 481  	d[7] -= ARSH(z, 18);
 482  	d[7] += (z << 14) & 0x3FFFFFFF;
 483  	d[8] += ARSH(z, 16);
 484  	c = z >> 31;
 485  	d[0] -= c;
 486  	d[3] += c << 6;
 487  	d[6] += c << 12;
 488  	d[7] -= c << 14;
 489  	d[8] += c << 16;
 490  	for (i = 0; i < 9; i ++) {
 491  		uint32_t w;
 492  
 493  		w = d[i] + z;
 494  		d[i] = w & 0x3FFFFFFF;
 495  		z = ARSH(w, 30);
 496  	}
 497  }
 498  
 499  /*
 500   * Compute a square in F256. Source operand shall be less than
 501   * twice the modulus.
 502   */
 503  static void
 504  square_f256(uint32_t *d, const uint32_t *a)
 505  {
 506  	uint32_t t[18];
 507  	uint64_t s[18];
 508  	uint64_t cc, x;
 509  	uint32_t z, c;
 510  	int i;
 511  
 512  	square9(t, a);
 513  
 514  	/*
 515  	 * Modular reduction: each high word in added/subtracted where
 516  	 * necessary.
 517  	 *
 518  	 * The modulus is:
 519  	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
 520  	 * Therefore:
 521  	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
 522  	 *
 523  	 * For a word x at bit offset n (n >= 256), we have:
 524  	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
 525  	 *            - x*2^(n - 160) + x*2^(n-256) mod p
 526  	 *
 527  	 * Thus, we can nullify the high word if we reinject it at some
 528  	 * proper emplacements.
 529  	 *
 530  	 * We use 64-bit intermediate words to allow for carries to
 531  	 * accumulate easily, before performing the final propagation.
 532  	 */
 533  	for (i = 0; i < 18; i ++) {
 534  		s[i] = t[i];
 535  	}
 536  
 537  	for (i = 17; i >= 9; i --) {
 538  		uint64_t y;
 539  
 540  		y = s[i];
 541  		s[i - 1] += ARSHW(y, 2);
 542  		s[i - 2] += (y << 28) & 0x3FFFFFFF;
 543  		s[i - 2] -= ARSHW(y, 4);
 544  		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
 545  		s[i - 5] -= ARSHW(y, 10);
 546  		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
 547  		s[i - 8] += ARSHW(y, 16);
 548  		s[i - 9] += (y << 14) & 0x3FFFFFFF;
 549  	}
 550  
 551  	/*
 552  	 * Carry propagation must be signed. Moreover, we may have overdone
 553  	 * it a bit, and obtain a negative result.
 554  	 *
 555  	 * The loop above ran 9 times; each time, each word was augmented
 556  	 * by at most one extra word (in absolute value). Thus, the top
 557  	 * word must in fine fit in 39 bits, so the carry below will fit
 558  	 * on 9 bits.
 559  	 */
 560  	cc = 0;
 561  	for (i = 0; i < 9; i ++) {
 562  		x = s[i] + cc;
 563  		d[i] = (uint32_t)x & 0x3FFFFFFF;
 564  		cc = ARSHW(x, 30);
 565  	}
 566  
 567  	/*
 568  	 * All nine words fit on 30 bits, but there may be an extra
 569  	 * carry for a few bits (at most 9), and that carry may be
 570  	 * negative. Moreover, we want the result to fit on 257 bits.
 571  	 * The two lines below ensure that the word in d[] has length
 572  	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
 573  	 * significant length of cc is less than 24 bits, so we will be
 574  	 * able to switch to 32-bit operations.
 575  	 */
 576  	cc = ARSHW(x, 16);
 577  	d[8] &= 0xFFFF;
 578  
 579  	/*
 580  	 * One extra round of reduction, for cc*2^256, which means
 581  	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
 582  	 * value. If cc is negative, then it may happen (rarely, but
 583  	 * not neglectibly so) that the result would be negative. In
 584  	 * order to avoid that, if cc is negative, then we add the
 585  	 * modulus once. Note that if cc is negative, then propagating
 586  	 * that carry must yield a value lower than the modulus, so
 587  	 * adding the modulus once will keep the final result under
 588  	 * twice the modulus.
 589  	 */
 590  	z = (uint32_t)cc;
 591  	d[3] -= z << 6;
 592  	d[6] -= (z << 12) & 0x3FFFFFFF;
 593  	d[7] -= ARSH(z, 18);
 594  	d[7] += (z << 14) & 0x3FFFFFFF;
 595  	d[8] += ARSH(z, 16);
 596  	c = z >> 31;
 597  	d[0] -= c;
 598  	d[3] += c << 6;
 599  	d[6] += c << 12;
 600  	d[7] -= c << 14;
 601  	d[8] += c << 16;
 602  	for (i = 0; i < 9; i ++) {
 603  		uint32_t w;
 604  
 605  		w = d[i] + z;
 606  		d[i] = w & 0x3FFFFFFF;
 607  		z = ARSH(w, 30);
 608  	}
 609  }
 610  
 611  /*
 612   * Perform a "final reduction" in field F256 (field for curve P-256).
 613   * The source value must be less than twice the modulus. If the value
 614   * is not lower than the modulus, then the modulus is subtracted and
 615   * this function returns 1; otherwise, it leaves it untouched and it
 616   * returns 0.
 617   */
 618  static uint32_t
 619  reduce_final_f256(uint32_t *d)
 620  {
 621  	uint32_t t[9];
 622  	uint32_t cc;
 623  	int i;
 624  
 625  	cc = 0;
 626  	for (i = 0; i < 9; i ++) {
 627  		uint32_t w;
 628  
 629  		w = d[i] - F256[i] - cc;
 630  		cc = w >> 31;
 631  		t[i] = w & 0x3FFFFFFF;
 632  	}
 633  	cc ^= 1;
 634  	CCOPY(cc, d, t, sizeof t);
 635  	return cc;
 636  }
 637  
 638  /*
 639   * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 640   * are such that:
 641   *   X = x / z^2
 642   *   Y = y / z^3
 643   * For the point at infinity, z = 0.
 644   * Each point thus admits many possible representations.
 645   *
 646   * Coordinates are represented in arrays of 32-bit integers, each holding
 647   * 30 bits of data. Values may also be slightly greater than the modulus,
 648   * but they will always be lower than twice the modulus.
 649   */
 650  typedef struct {
 651  	uint32_t x[9];
 652  	uint32_t y[9];
 653  	uint32_t z[9];
 654  } p256_jacobian;
 655  
 656  /*
 657   * Convert a point to affine coordinates:
 658   *  - If the point is the point at infinity, then all three coordinates
 659   *    are set to 0.
 660   *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 661   *    coordinates are the 'X' and 'Y' affine coordinates.
 662   * The coordinates are guaranteed to be lower than the modulus.
 663   */
 664  static void
 665  p256_to_affine(p256_jacobian *P)
 666  {
 667  	uint32_t t1[9], t2[9];
 668  	int i;
 669  
 670  	/*
 671  	 * Invert z with a modular exponentiation: the modulus is
 672  	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
 673  	 * p-2. Exponent bit pattern (from high to low) is:
 674  	 *  - 32 bits of value 1
 675  	 *  - 31 bits of value 0
 676  	 *  - 1 bit of value 1
 677  	 *  - 96 bits of value 0
 678  	 *  - 94 bits of value 1
 679  	 *  - 1 bit of value 0
 680  	 *  - 1 bit of value 1
 681  	 * Thus, we precompute z^(2^31-1) to speed things up.
 682  	 *
 683  	 * If z = 0 (point at infinity) then the modular exponentiation
 684  	 * will yield 0, which leads to the expected result (all three
 685  	 * coordinates set to 0).
 686  	 */
 687  
 688  	/*
 689  	 * A simple square-and-multiply for z^(2^31-1). We could save about
 690  	 * two dozen multiplications here with an addition chain, but
 691  	 * this would require a bit more code, and extra stack buffers.
 692  	 */
 693  	memcpy(t1, P->z, sizeof P->z);
 694  	for (i = 0; i < 30; i ++) {
 695  		square_f256(t1, t1);
 696  		mul_f256(t1, t1, P->z);
 697  	}
 698  
 699  	/*
 700  	 * Square-and-multiply. Apart from the squarings, we have a few
 701  	 * multiplications to set bits to 1; we multiply by the original z
 702  	 * for setting 1 bit, and by t1 for setting 31 bits.
 703  	 */
 704  	memcpy(t2, P->z, sizeof P->z);
 705  	for (i = 1; i < 256; i ++) {
 706  		square_f256(t2, t2);
 707  		switch (i) {
 708  		case 31:
 709  		case 190:
 710  		case 221:
 711  		case 252:
 712  			mul_f256(t2, t2, t1);
 713  			break;
 714  		case 63:
 715  		case 253:
 716  		case 255:
 717  			mul_f256(t2, t2, P->z);
 718  			break;
 719  		}
 720  	}
 721  
 722  	/*
 723  	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
 724  	 */
 725  	mul_f256(t1, t2, t2);
 726  	mul_f256(P->x, t1, P->x);
 727  	mul_f256(t1, t1, t2);
 728  	mul_f256(P->y, t1, P->y);
 729  	reduce_final_f256(P->x);
 730  	reduce_final_f256(P->y);
 731  
 732  	/*
 733  	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
 734  	 * this will set z to 1.
 735  	 */
 736  	mul_f256(P->z, P->z, t2);
 737  	reduce_final_f256(P->z);
 738  }
 739  
 740  /*
 741   * Double a point in P-256. This function works for all valid points,
 742   * including the point at infinity.
 743   */
 744  static void
 745  p256_double(p256_jacobian *Q)
 746  {
 747  	/*
 748  	 * Doubling formulas are:
 749  	 *
 750  	 *   s = 4*x*y^2
 751  	 *   m = 3*(x + z^2)*(x - z^2)
 752  	 *   x' = m^2 - 2*s
 753  	 *   y' = m*(s - x') - 8*y^4
 754  	 *   z' = 2*y*z
 755  	 *
 756  	 * These formulas work for all points, including points of order 2
 757  	 * and points at infinity:
 758  	 *   - If y = 0 then z' = 0. But there is no such point in P-256
 759  	 *     anyway.
 760  	 *   - If z = 0 then z' = 0.
 761  	 */
 762  	uint32_t t1[9], t2[9], t3[9], t4[9];
 763  
 764  	/*
 765  	 * Compute z^2 in t1.
 766  	 */
 767  	square_f256(t1, Q->z);
 768  
 769  	/*
 770  	 * Compute x-z^2 in t2 and x+z^2 in t1.
 771  	 */
 772  	add_f256(t2, Q->x, t1);
 773  	sub_f256(t1, Q->x, t1);
 774  
 775  	/*
 776  	 * Compute 3*(x+z^2)*(x-z^2) in t1.
 777  	 */
 778  	mul_f256(t3, t1, t2);
 779  	add_f256(t1, t3, t3);
 780  	add_f256(t1, t3, t1);
 781  
 782  	/*
 783  	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
 784  	 */
 785  	square_f256(t3, Q->y);
 786  	add_f256(t3, t3, t3);
 787  	mul_f256(t2, Q->x, t3);
 788  	add_f256(t2, t2, t2);
 789  
 790  	/*
 791  	 * Compute x' = m^2 - 2*s.
 792  	 */
 793  	square_f256(Q->x, t1);
 794  	sub_f256(Q->x, Q->x, t2);
 795  	sub_f256(Q->x, Q->x, t2);
 796  
 797  	/*
 798  	 * Compute z' = 2*y*z.
 799  	 */
 800  	mul_f256(t4, Q->y, Q->z);
 801  	add_f256(Q->z, t4, t4);
 802  
 803  	/*
 804  	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
 805  	 * 2*y^2 in t3.
 806  	 */
 807  	sub_f256(t2, t2, Q->x);
 808  	mul_f256(Q->y, t1, t2);
 809  	square_f256(t4, t3);
 810  	add_f256(t4, t4, t4);
 811  	sub_f256(Q->y, Q->y, t4);
 812  }
 813  
 814  /*
 815   * Add point P2 to point P1.
 816   *
 817   * This function computes the wrong result in the following cases:
 818   *
 819   *   - If P1 == 0 but P2 != 0
 820   *   - If P1 != 0 but P2 == 0
 821   *   - If P1 == P2
 822   *
 823   * In all three cases, P1 is set to the point at infinity.
 824   *
 825   * Returned value is 0 if one of the following occurs:
 826   *
 827   *   - P1 and P2 have the same Y coordinate
 828   *   - P1 == 0 and P2 == 0
 829   *   - The Y coordinate of one of the points is 0 and the other point is
 830   *     the point at infinity.
 831   *
 832   * The third case cannot actually happen with valid points, since a point
 833   * with Y == 0 is a point of order 2, and there is no point of order 2 on
 834   * curve P-256.
 835   *
 836   * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 837   * can apply the following:
 838   *
 839   *   - If the result is not the point at infinity, then it is correct.
 840   *   - Otherwise, if the returned value is 1, then this is a case of
 841   *     P1+P2 == 0, so the result is indeed the point at infinity.
 842   *   - Otherwise, P1 == P2, so a "double" operation should have been
 843   *     performed.
 844   */
 845  static uint32_t
 846  p256_add(p256_jacobian *P1, const p256_jacobian *P2)
 847  {
 848  	/*
 849  	 * Addtions formulas are:
 850  	 *
 851  	 *   u1 = x1 * z2^2
 852  	 *   u2 = x2 * z1^2
 853  	 *   s1 = y1 * z2^3
 854  	 *   s2 = y2 * z1^3
 855  	 *   h = u2 - u1
 856  	 *   r = s2 - s1
 857  	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
 858  	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 859  	 *   z3 = h * z1 * z2
 860  	 */
 861  	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 862  	uint32_t ret;
 863  	int i;
 864  
 865  	/*
 866  	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
 867  	 */
 868  	square_f256(t3, P2->z);
 869  	mul_f256(t1, P1->x, t3);
 870  	mul_f256(t4, P2->z, t3);
 871  	mul_f256(t3, P1->y, t4);
 872  
 873  	/*
 874  	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 875  	 */
 876  	square_f256(t4, P1->z);
 877  	mul_f256(t2, P2->x, t4);
 878  	mul_f256(t5, P1->z, t4);
 879  	mul_f256(t4, P2->y, t5);
 880  
 881  	/*
 882  	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 883  	 * We need to test whether r is zero, so we will do some extra
 884  	 * reduce.
 885  	 */
 886  	sub_f256(t2, t2, t1);
 887  	sub_f256(t4, t4, t3);
 888  	reduce_final_f256(t4);
 889  	ret = 0;
 890  	for (i = 0; i < 9; i ++) {
 891  		ret |= t4[i];
 892  	}
 893  	ret = (ret | -ret) >> 31;
 894  
 895  	/*
 896  	 * Compute u1*h^2 (in t6) and h^3 (in t5);
 897  	 */
 898  	square_f256(t7, t2);
 899  	mul_f256(t6, t1, t7);
 900  	mul_f256(t5, t7, t2);
 901  
 902  	/*
 903  	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
 904  	 */
 905  	square_f256(P1->x, t4);
 906  	sub_f256(P1->x, P1->x, t5);
 907  	sub_f256(P1->x, P1->x, t6);
 908  	sub_f256(P1->x, P1->x, t6);
 909  
 910  	/*
 911  	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
 912  	 */
 913  	sub_f256(t6, t6, P1->x);
 914  	mul_f256(P1->y, t4, t6);
 915  	mul_f256(t1, t5, t3);
 916  	sub_f256(P1->y, P1->y, t1);
 917  
 918  	/*
 919  	 * Compute z3 = h*z1*z2.
 920  	 */
 921  	mul_f256(t1, P1->z, P2->z);
 922  	mul_f256(P1->z, t1, t2);
 923  
 924  	return ret;
 925  }
 926  
 927  /*
 928   * Add point P2 to point P1. This is a specialised function for the
 929   * case when P2 is a non-zero point in affine coordinate.
 930   *
 931   * This function computes the wrong result in the following cases:
 932   *
 933   *   - If P1 == 0
 934   *   - If P1 == P2
 935   *
 936   * In both cases, P1 is set to the point at infinity.
 937   *
 938   * Returned value is 0 if one of the following occurs:
 939   *
 940   *   - P1 and P2 have the same Y coordinate
 941   *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
 942   *
 943   * The second case cannot actually happen with valid points, since a point
 944   * with Y == 0 is a point of order 2, and there is no point of order 2 on
 945   * curve P-256.
 946   *
 947   * Therefore, assuming that P1 != 0 on input, then the caller
 948   * can apply the following:
 949   *
 950   *   - If the result is not the point at infinity, then it is correct.
 951   *   - Otherwise, if the returned value is 1, then this is a case of
 952   *     P1+P2 == 0, so the result is indeed the point at infinity.
 953   *   - Otherwise, P1 == P2, so a "double" operation should have been
 954   *     performed.
 955   */
 956  static uint32_t
 957  p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
 958  {
 959  	/*
 960  	 * Addtions formulas are:
 961  	 *
 962  	 *   u1 = x1
 963  	 *   u2 = x2 * z1^2
 964  	 *   s1 = y1
 965  	 *   s2 = y2 * z1^3
 966  	 *   h = u2 - u1
 967  	 *   r = s2 - s1
 968  	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
 969  	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
 970  	 *   z3 = h * z1
 971  	 */
 972  	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
 973  	uint32_t ret;
 974  	int i;
 975  
 976  	/*
 977  	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
 978  	 */
 979  	memcpy(t1, P1->x, sizeof t1);
 980  	memcpy(t3, P1->y, sizeof t3);
 981  
 982  	/*
 983  	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
 984  	 */
 985  	square_f256(t4, P1->z);
 986  	mul_f256(t2, P2->x, t4);
 987  	mul_f256(t5, P1->z, t4);
 988  	mul_f256(t4, P2->y, t5);
 989  
 990  	/*
 991  	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
 992  	 * We need to test whether r is zero, so we will do some extra
 993  	 * reduce.
 994  	 */
 995  	sub_f256(t2, t2, t1);
 996  	sub_f256(t4, t4, t3);
 997  	reduce_final_f256(t4);
 998  	ret = 0;
 999  	for (i = 0; i < 9; i ++) {
1000  		ret |= t4[i];
1001  	}
1002  	ret = (ret | -ret) >> 31;
1003  
1004  	/*
1005  	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1006  	 */
1007  	square_f256(t7, t2);
1008  	mul_f256(t6, t1, t7);
1009  	mul_f256(t5, t7, t2);
1010  
1011  	/*
1012  	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013  	 */
1014  	square_f256(P1->x, t4);
1015  	sub_f256(P1->x, P1->x, t5);
1016  	sub_f256(P1->x, P1->x, t6);
1017  	sub_f256(P1->x, P1->x, t6);
1018  
1019  	/*
1020  	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021  	 */
1022  	sub_f256(t6, t6, P1->x);
1023  	mul_f256(P1->y, t4, t6);
1024  	mul_f256(t1, t5, t3);
1025  	sub_f256(P1->y, P1->y, t1);
1026  
1027  	/*
1028  	 * Compute z3 = h*z1*z2.
1029  	 */
1030  	mul_f256(P1->z, P1->z, t2);
1031  
1032  	return ret;
1033  }
1034  
1035  /*
1036   * Decode a P-256 point. This function does not support the point at
1037   * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038   */
1039  static uint32_t
1040  p256_decode(p256_jacobian *P, const void *src, size_t len)
1041  {
1042  	const unsigned char *buf;
1043  	uint32_t tx[9], ty[9], t1[9], t2[9];
1044  	uint32_t bad;
1045  	int i;
1046  
1047  	if (len != 65) {
1048  		return 0;
1049  	}
1050  	buf = src;
1051  
1052  	/*
1053  	 * First byte must be 0x04 (uncompressed format). We could support
1054  	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055  	 * least significant bit of the Y coordinate), but it is explicitly
1056  	 * forbidden by RFC 5480 (section 2.2).
1057  	 */
1058  	bad = NEQ(buf[0], 0x04);
1059  
1060  	/*
1061  	 * Decode the coordinates, and check that they are both lower
1062  	 * than the modulus.
1063  	 */
1064  	tx[8] = be8_to_le30(tx, buf + 1, 32);
1065  	ty[8] = be8_to_le30(ty, buf + 33, 32);
1066  	bad |= reduce_final_f256(tx);
1067  	bad |= reduce_final_f256(ty);
1068  
1069  	/*
1070  	 * Check curve equation.
1071  	 */
1072  	square_f256(t1, tx);
1073  	mul_f256(t1, tx, t1);
1074  	square_f256(t2, ty);
1075  	sub_f256(t1, t1, tx);
1076  	sub_f256(t1, t1, tx);
1077  	sub_f256(t1, t1, tx);
1078  	add_f256(t1, t1, P256_B);
1079  	sub_f256(t1, t1, t2);
1080  	reduce_final_f256(t1);
1081  	for (i = 0; i < 9; i ++) {
1082  		bad |= t1[i];
1083  	}
1084  
1085  	/*
1086  	 * Copy coordinates to the point structure.
1087  	 */
1088  	memcpy(P->x, tx, sizeof tx);
1089  	memcpy(P->y, ty, sizeof ty);
1090  	memset(P->z, 0, sizeof P->z);
1091  	P->z[0] = 1;
1092  	return EQ(bad, 0);
1093  }
1094  
1095  /*
1096   * Encode a point into a buffer. This function assumes that the point is
1097   * valid, in affine coordinates, and not the point at infinity.
1098   */
1099  static void
1100  p256_encode(void *dst, const p256_jacobian *P)
1101  {
1102  	unsigned char *buf;
1103  
1104  	buf = dst;
1105  	buf[0] = 0x04;
1106  	le30_to_be8(buf + 1, 32, P->x);
1107  	le30_to_be8(buf + 33, 32, P->y);
1108  }
1109  
1110  /*
1111   * Multiply a curve point by an integer. The integer is assumed to be
1112   * lower than the curve order, and the base point must not be the point
1113   * at infinity.
1114   */
1115  static void
1116  p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117  {
1118  	/*
1119  	 * qz is a flag that is initially 1, and remains equal to 1
1120  	 * as long as the point is the point at infinity.
1121  	 *
1122  	 * We use a 2-bit window to handle multiplier bits by pairs.
1123  	 * The precomputed window really is the points P2 and P3.
1124  	 */
1125  	uint32_t qz;
1126  	p256_jacobian P2, P3, Q, T, U;
1127  
1128  	/*
1129  	 * Compute window values.
1130  	 */
1131  	P2 = *P;
1132  	p256_double(&P2);
1133  	P3 = *P;
1134  	p256_add(&P3, &P2);
1135  
1136  	/*
1137  	 * We start with Q = 0. We process multiplier bits 2 by 2.
1138  	 */
1139  	memset(&Q, 0, sizeof Q);
1140  	qz = 1;
1141  	while (xlen -- > 0) {
1142  		int k;
1143  
1144  		for (k = 6; k >= 0; k -= 2) {
1145  			uint32_t bits;
1146  			uint32_t bnz;
1147  
1148  			p256_double(&Q);
1149  			p256_double(&Q);
1150  			T = *P;
1151  			U = Q;
1152  			bits = (*x >> k) & (uint32_t)3;
1153  			bnz = NEQ(bits, 0);
1154  			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155  			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156  			p256_add(&U, &T);
1157  			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158  			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159  			qz &= ~bnz;
1160  		}
1161  		x ++;
1162  	}
1163  	*P = Q;
1164  }
1165  
1166  /*
1167   * Precomputed window: k*G points, where G is the curve generator, and k
1168   * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169   * the point are encoded as 9 words of 30 bits each (little-endian
1170   * order).
1171   */
1172  static const uint32_t Gwin[15][18] = {
1173  
1174  	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175  	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176  	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177  	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178  	  0x10B8BF86, 0x00004FE3 },
1179  
1180  	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181  	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182  	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183  	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184  	  0x154436E3, 0x00000777 },
1185  
1186  	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187  	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188  	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189  	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190  	  0x19031266, 0x00008734 },
1191  
1192  	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193  	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194  	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195  	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196  	  0x15D69318, 0x0000E0F1 },
1197  
1198  	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199  	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200  	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201  	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202  	  0x1F6A2412, 0x0000E0C1 },
1203  
1204  	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205  	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206  	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207  	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208  	  0x041D0C8D, 0x0000E85C },
1209  
1210  	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211  	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212  	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213  	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214  	  0x076F780C, 0x000073EB },
1215  
1216  	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217  	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218  	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219  	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220  	  0x332F647A, 0x0000AD5A },
1221  
1222  	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223  	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224  	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225  	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226  	  0x11325CB2, 0x00002A27 },
1227  
1228  	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229  	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230  	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231  	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232  	  0x18A88A6A, 0x00008786 },
1233  
1234  	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235  	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236  	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237  	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238  	  0x0826B331, 0x00009099 },
1239  
1240  	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241  	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242  	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243  	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244  	  0x2D1AA70E, 0x00000770 },
1245  
1246  	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247  	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248  	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249  	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250  	  0x163353AF, 0x000063BB },
1251  
1252  	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253  	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254  	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255  	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256  	  0x3C6ECA7D, 0x0000F599 },
1257  
1258  	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259  	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260  	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261  	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262  	  0x0FB8D64B, 0x0000B5B9 }
1263  };
1264  
1265  /*
1266   * Lookup one of the Gwin[] values, by index. This is constant-time.
1267   */
1268  static void
1269  lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270  {
1271  	uint32_t xy[18];
1272  	uint32_t k;
1273  	size_t u;
1274  
1275  	memset(xy, 0, sizeof xy);
1276  	for (k = 0; k < 15; k ++) {
1277  		uint32_t m;
1278  
1279  		m = -EQ(idx, k + 1);
1280  		for (u = 0; u < 18; u ++) {
1281  			xy[u] |= m & Gwin[k][u];
1282  		}
1283  	}
1284  	memcpy(T->x, &xy[0], sizeof T->x);
1285  	memcpy(T->y, &xy[9], sizeof T->y);
1286  	memset(T->z, 0, sizeof T->z);
1287  	T->z[0] = 1;
1288  }
1289  
1290  /*
1291   * Multiply the generator by an integer. The integer is assumed non-zero
1292   * and lower than the curve order.
1293   */
1294  static void
1295  p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296  {
1297  	/*
1298  	 * qz is a flag that is initially 1, and remains equal to 1
1299  	 * as long as the point is the point at infinity.
1300  	 *
1301  	 * We use a 4-bit window to handle multiplier bits by groups
1302  	 * of 4. The precomputed window is constant static data, with
1303  	 * points in affine coordinates; we use a constant-time lookup.
1304  	 */
1305  	p256_jacobian Q;
1306  	uint32_t qz;
1307  
1308  	memset(&Q, 0, sizeof Q);
1309  	qz = 1;
1310  	while (xlen -- > 0) {
1311  		int k;
1312  		unsigned bx;
1313  
1314  		bx = *x ++;
1315  		for (k = 0; k < 2; k ++) {
1316  			uint32_t bits;
1317  			uint32_t bnz;
1318  			p256_jacobian T, U;
1319  
1320  			p256_double(&Q);
1321  			p256_double(&Q);
1322  			p256_double(&Q);
1323  			p256_double(&Q);
1324  			bits = (bx >> 4) & 0x0F;
1325  			bnz = NEQ(bits, 0);
1326  			lookup_Gwin(&T, bits);
1327  			U = Q;
1328  			p256_add_mixed(&U, &T);
1329  			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330  			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331  			qz &= ~bnz;
1332  			bx <<= 4;
1333  		}
1334  	}
1335  	*P = Q;
1336  }
1337  
1338  static const unsigned char P256_G[] = {
1339  	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340  	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341  	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342  	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343  	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344  	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345  	0x68, 0x37, 0xBF, 0x51, 0xF5
1346  };
1347  
1348  static const unsigned char P256_N[] = {
1349  	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350  	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351  	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352  	0x25, 0x51
1353  };
1354  
1355  static const unsigned char *
1356  api_generator(int curve, size_t *len)
1357  {
1358  	(void)curve;
1359  	*len = sizeof P256_G;
1360  	return P256_G;
1361  }
1362  
1363  static const unsigned char *
1364  api_order(int curve, size_t *len)
1365  {
1366  	(void)curve;
1367  	*len = sizeof P256_N;
1368  	return P256_N;
1369  }
1370  
1371  static size_t
1372  api_xoff(int curve, size_t *len)
1373  {
1374  	(void)curve;
1375  	*len = 32;
1376  	return 1;
1377  }
1378  
1379  static uint32_t
1380  api_mul(unsigned char *G, size_t Glen,
1381  	const unsigned char *x, size_t xlen, int curve)
1382  {
1383  	uint32_t r;
1384  	p256_jacobian P;
1385  
1386  	(void)curve;
1387  	if (Glen != 65) {
1388  		return 0;
1389  	}
1390  	r = p256_decode(&P, G, Glen);
1391  	p256_mul(&P, x, xlen);
1392  	p256_to_affine(&P);
1393  	p256_encode(G, &P);
1394  	return r;
1395  }
1396  
1397  static size_t
1398  api_mulgen(unsigned char *R,
1399  	const unsigned char *x, size_t xlen, int curve)
1400  {
1401  	p256_jacobian P;
1402  
1403  	(void)curve;
1404  	p256_mulgen(&P, x, xlen);
1405  	p256_to_affine(&P);
1406  	p256_encode(R, &P);
1407  	return 65;
1408  }
1409  
1410  static uint32_t
1411  api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1412  	const unsigned char *x, size_t xlen,
1413  	const unsigned char *y, size_t ylen, int curve)
1414  {
1415  	p256_jacobian P, Q;
1416  	uint32_t r, t, z;
1417  	int i;
1418  
1419  	(void)curve;
1420  	if (len != 65) {
1421  		return 0;
1422  	}
1423  	r = p256_decode(&P, A, len);
1424  	p256_mul(&P, x, xlen);
1425  	if (B == NULL) {
1426  		p256_mulgen(&Q, y, ylen);
1427  	} else {
1428  		r &= p256_decode(&Q, B, len);
1429  		p256_mul(&Q, y, ylen);
1430  	}
1431  
1432  	/*
1433  	 * The final addition may fail in case both points are equal.
1434  	 */
1435  	t = p256_add(&P, &Q);
1436  	reduce_final_f256(P.z);
1437  	z = 0;
1438  	for (i = 0; i < 9; i ++) {
1439  		z |= P.z[i];
1440  	}
1441  	z = EQ(z, 0);
1442  	p256_double(&Q);
1443  
1444  	/*
1445  	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1446  	 * have the following:
1447  	 *
1448  	 *   z = 0, t = 0   return P (normal addition)
1449  	 *   z = 0, t = 1   return P (normal addition)
1450  	 *   z = 1, t = 0   return Q (a 'double' case)
1451  	 *   z = 1, t = 1   report an error (P+Q = 0)
1452  	 */
1453  	CCOPY(z & ~t, &P, &Q, sizeof Q);
1454  	p256_to_affine(&P);
1455  	p256_encode(A, &P);
1456  	r &= ~(z & t);
1457  	return r;
1458  }
1459  
1460  /* see bearssl_ec.h */
1461  const br_ec_impl br_ec_p256_m31 = {
1462  	(uint32_t)0x00800000,
1463  	&api_generator,
1464  	&api_order,
1465  	&api_xoff,
1466  	&api_mul,
1467  	&api_mulgen,
1468  	&api_muladd
1469  };