/ src / ec / ec_c25519_m31.c
ec_c25519_m31.c
  1  /*
  2   * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
  3   *
  4   * Permission is hereby granted, free of charge, to any person obtaining 
  5   * a copy of this software and associated documentation files (the
  6   * "Software"), to deal in the Software without restriction, including
  7   * without limitation the rights to use, copy, modify, merge, publish,
  8   * distribute, sublicense, and/or sell copies of the Software, and to
  9   * permit persons to whom the Software is furnished to do so, subject to
 10   * the following conditions:
 11   *
 12   * The above copyright notice and this permission notice shall be 
 13   * included in all copies or substantial portions of the Software.
 14   *
 15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22   * SOFTWARE.
 23   */
 24  
 25  #include "inner.h"
 26  
 27  /* obsolete
 28  #include <stdio.h>
 29  #include <stdlib.h>
 30  static void
 31  print_int(const char *name, const uint32_t *x)
 32  {
 33  	size_t u;
 34  	unsigned char tmp[40];
 35  
 36  	printf("%s = ", name);
 37  	for (u = 0; u < 9; u ++) {
 38  		if (x[u] > 0x3FFFFFFF) {
 39  			printf("INVALID:");
 40  			for (u = 0; u < 9; u ++) {
 41  				printf(" %08X", x[u]);
 42  			}
 43  			printf("\n");
 44  			return;
 45  		}
 46  	}
 47  	memset(tmp, 0, sizeof tmp);
 48  	for (u = 0; u < 9; u ++) {
 49  		uint64_t w;
 50  		int j, k;
 51  
 52  		w = x[u];
 53  		j = 30 * (int)u;
 54  		k = j & 7;
 55  		if (k != 0) {
 56  			w <<= k;
 57  			j -= k;
 58  		}
 59  		k = j >> 3;
 60  		for (j = 0; j < 8; j ++) {
 61  			tmp[39 - k - j] |= (unsigned char)w;
 62  			w >>= 8;
 63  		}
 64  	}
 65  	for (u = 8; u < 40; u ++) {
 66  		printf("%02X", tmp[u]);
 67  	}
 68  	printf("\n");
 69  }
 70  */
 71  
 72  /*
 73   * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
 74   * that right-shifting a signed negative integer copies the sign bit
 75   * (arithmetic right-shift). This is "implementation-defined behaviour",
 76   * i.e. it is not undefined, but it may differ between compilers. Each
 77   * compiler is supposed to document its behaviour in that respect. GCC
 78   * explicitly defines that an arithmetic right shift is used. We expect
 79   * all other compilers to do the same, because underlying CPU offer an
 80   * arithmetic right shift opcode that could not be used otherwise.
 81   */
 82  #if BR_NO_ARITH_SHIFT
 83  #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
 84                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
 85  #else
 86  #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
 87  #endif
 88  
 89  /*
 90   * Convert an integer from unsigned little-endian encoding to a sequence of
 91   * 30-bit words in little-endian order. The final "partial" word is
 92   * returned.
 93   */
 94  static uint32_t
 95  le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
 96  {
 97  	uint32_t acc;
 98  	int acc_len;
 99  
100  	acc = 0;
101  	acc_len = 0;
102  	while (len -- > 0) {
103  		uint32_t b;
104  
105  		b = *src ++;
106  		if (acc_len < 22) {
107  			acc |= b << acc_len;
108  			acc_len += 8;
109  		} else {
110  			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
111  			acc = b >> (30 - acc_len);
112  			acc_len -= 22;
113  		}
114  	}
115  	return acc;
116  }
117  
118  /*
119   * Convert an integer (30-bit words, little-endian) to unsigned
120   * little-endian encoding. The total encoding length is provided; all
121   * the destination bytes will be filled.
122   */
123  static void
124  le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
125  {
126  	uint32_t acc;
127  	int acc_len;
128  
129  	acc = 0;
130  	acc_len = 0;
131  	while (len -- > 0) {
132  		if (acc_len < 8) {
133  			uint32_t w;
134  
135  			w = *src ++;
136  			*dst ++ = (unsigned char)(acc | (w << acc_len));
137  			acc = w >> (8 - acc_len);
138  			acc_len += 22;
139  		} else {
140  			*dst ++ = (unsigned char)acc;
141  			acc >>= 8;
142  			acc_len -= 8;
143  		}
144  	}
145  }
146  
147  /*
148   * Multiply two integers. Source integers are represented as arrays of
149   * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150   * 18 words of 30 bits each.
151   */
152  static void
153  mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
154  {
155  	/*
156  	 * Maximum intermediate result is no more than
157  	 * 10376293531797946367, which fits in 64 bits. Reason:
158  	 *
159  	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160  	 *   10376293531797946367 < 9663676407 * 2^30
161  	 *
162  	 * Thus, adding together 9 products of 30-bit integers, with
163  	 * a carry of at most 9663676406, yields an integer that fits
164  	 * on 64 bits and generates a carry of at most 9663676406.
165  	 */
166  	uint64_t t[17];
167  	uint64_t cc;
168  	int i;
169  
170  	t[ 0] = MUL31(a[0], b[0]);
171  	t[ 1] = MUL31(a[0], b[1])
172  		+ MUL31(a[1], b[0]);
173  	t[ 2] = MUL31(a[0], b[2])
174  		+ MUL31(a[1], b[1])
175  		+ MUL31(a[2], b[0]);
176  	t[ 3] = MUL31(a[0], b[3])
177  		+ MUL31(a[1], b[2])
178  		+ MUL31(a[2], b[1])
179  		+ MUL31(a[3], b[0]);
180  	t[ 4] = MUL31(a[0], b[4])
181  		+ MUL31(a[1], b[3])
182  		+ MUL31(a[2], b[2])
183  		+ MUL31(a[3], b[1])
184  		+ MUL31(a[4], b[0]);
185  	t[ 5] = MUL31(a[0], b[5])
186  		+ MUL31(a[1], b[4])
187  		+ MUL31(a[2], b[3])
188  		+ MUL31(a[3], b[2])
189  		+ MUL31(a[4], b[1])
190  		+ MUL31(a[5], b[0]);
191  	t[ 6] = MUL31(a[0], b[6])
192  		+ MUL31(a[1], b[5])
193  		+ MUL31(a[2], b[4])
194  		+ MUL31(a[3], b[3])
195  		+ MUL31(a[4], b[2])
196  		+ MUL31(a[5], b[1])
197  		+ MUL31(a[6], b[0]);
198  	t[ 7] = MUL31(a[0], b[7])
199  		+ MUL31(a[1], b[6])
200  		+ MUL31(a[2], b[5])
201  		+ MUL31(a[3], b[4])
202  		+ MUL31(a[4], b[3])
203  		+ MUL31(a[5], b[2])
204  		+ MUL31(a[6], b[1])
205  		+ MUL31(a[7], b[0]);
206  	t[ 8] = MUL31(a[0], b[8])
207  		+ MUL31(a[1], b[7])
208  		+ MUL31(a[2], b[6])
209  		+ MUL31(a[3], b[5])
210  		+ MUL31(a[4], b[4])
211  		+ MUL31(a[5], b[3])
212  		+ MUL31(a[6], b[2])
213  		+ MUL31(a[7], b[1])
214  		+ MUL31(a[8], b[0]);
215  	t[ 9] = MUL31(a[1], b[8])
216  		+ MUL31(a[2], b[7])
217  		+ MUL31(a[3], b[6])
218  		+ MUL31(a[4], b[5])
219  		+ MUL31(a[5], b[4])
220  		+ MUL31(a[6], b[3])
221  		+ MUL31(a[7], b[2])
222  		+ MUL31(a[8], b[1]);
223  	t[10] = MUL31(a[2], b[8])
224  		+ MUL31(a[3], b[7])
225  		+ MUL31(a[4], b[6])
226  		+ MUL31(a[5], b[5])
227  		+ MUL31(a[6], b[4])
228  		+ MUL31(a[7], b[3])
229  		+ MUL31(a[8], b[2]);
230  	t[11] = MUL31(a[3], b[8])
231  		+ MUL31(a[4], b[7])
232  		+ MUL31(a[5], b[6])
233  		+ MUL31(a[6], b[5])
234  		+ MUL31(a[7], b[4])
235  		+ MUL31(a[8], b[3]);
236  	t[12] = MUL31(a[4], b[8])
237  		+ MUL31(a[5], b[7])
238  		+ MUL31(a[6], b[6])
239  		+ MUL31(a[7], b[5])
240  		+ MUL31(a[8], b[4]);
241  	t[13] = MUL31(a[5], b[8])
242  		+ MUL31(a[6], b[7])
243  		+ MUL31(a[7], b[6])
244  		+ MUL31(a[8], b[5]);
245  	t[14] = MUL31(a[6], b[8])
246  		+ MUL31(a[7], b[7])
247  		+ MUL31(a[8], b[6]);
248  	t[15] = MUL31(a[7], b[8])
249  		+ MUL31(a[8], b[7]);
250  	t[16] = MUL31(a[8], b[8]);
251  
252  	/*
253  	 * Propagate carries.
254  	 */
255  	cc = 0;
256  	for (i = 0; i < 17; i ++) {
257  		uint64_t w;
258  
259  		w = t[i] + cc;
260  		d[i] = (uint32_t)w & 0x3FFFFFFF;
261  		cc = w >> 30;
262  	}
263  	d[17] = (uint32_t)cc;
264  }
265  
266  /*
267   * Square a 270-bit integer, represented as an array of nine 30-bit words.
268   * Result uses 18 words of 30 bits each.
269   */
270  static void
271  square9(uint32_t *d, const uint32_t *a)
272  {
273  	uint64_t t[17];
274  	uint64_t cc;
275  	int i;
276  
277  	t[ 0] = MUL31(a[0], a[0]);
278  	t[ 1] = ((MUL31(a[0], a[1])) << 1);
279  	t[ 2] = MUL31(a[1], a[1])
280  		+ ((MUL31(a[0], a[2])) << 1);
281  	t[ 3] = ((MUL31(a[0], a[3])
282  		+ MUL31(a[1], a[2])) << 1);
283  	t[ 4] = MUL31(a[2], a[2])
284  		+ ((MUL31(a[0], a[4])
285  		+ MUL31(a[1], a[3])) << 1);
286  	t[ 5] = ((MUL31(a[0], a[5])
287  		+ MUL31(a[1], a[4])
288  		+ MUL31(a[2], a[3])) << 1);
289  	t[ 6] = MUL31(a[3], a[3])
290  		+ ((MUL31(a[0], a[6])
291  		+ MUL31(a[1], a[5])
292  		+ MUL31(a[2], a[4])) << 1);
293  	t[ 7] = ((MUL31(a[0], a[7])
294  		+ MUL31(a[1], a[6])
295  		+ MUL31(a[2], a[5])
296  		+ MUL31(a[3], a[4])) << 1);
297  	t[ 8] = MUL31(a[4], a[4])
298  		+ ((MUL31(a[0], a[8])
299  		+ MUL31(a[1], a[7])
300  		+ MUL31(a[2], a[6])
301  		+ MUL31(a[3], a[5])) << 1);
302  	t[ 9] = ((MUL31(a[1], a[8])
303  		+ MUL31(a[2], a[7])
304  		+ MUL31(a[3], a[6])
305  		+ MUL31(a[4], a[5])) << 1);
306  	t[10] = MUL31(a[5], a[5])
307  		+ ((MUL31(a[2], a[8])
308  		+ MUL31(a[3], a[7])
309  		+ MUL31(a[4], a[6])) << 1);
310  	t[11] = ((MUL31(a[3], a[8])
311  		+ MUL31(a[4], a[7])
312  		+ MUL31(a[5], a[6])) << 1);
313  	t[12] = MUL31(a[6], a[6])
314  		+ ((MUL31(a[4], a[8])
315  		+ MUL31(a[5], a[7])) << 1);
316  	t[13] = ((MUL31(a[5], a[8])
317  		+ MUL31(a[6], a[7])) << 1);
318  	t[14] = MUL31(a[7], a[7])
319  		+ ((MUL31(a[6], a[8])) << 1);
320  	t[15] = ((MUL31(a[7], a[8])) << 1);
321  	t[16] = MUL31(a[8], a[8]);
322  
323  	/*
324  	 * Propagate carries.
325  	 */
326  	cc = 0;
327  	for (i = 0; i < 17; i ++) {
328  		uint64_t w;
329  
330  		w = t[i] + cc;
331  		d[i] = (uint32_t)w & 0x3FFFFFFF;
332  		cc = w >> 30;
333  	}
334  	d[17] = (uint32_t)cc;
335  }
336  
337  /*
338   * Perform a "final reduction" in field F255 (field for Curve25519)
339   * The source value must be less than twice the modulus. If the value
340   * is not lower than the modulus, then the modulus is subtracted and
341   * this function returns 1; otherwise, it leaves it untouched and it
342   * returns 0.
343   */
344  static uint32_t
345  reduce_final_f255(uint32_t *d)
346  {
347  	uint32_t t[9];
348  	uint32_t cc;
349  	int i;
350  
351  	memcpy(t, d, sizeof t);
352  	cc = 19;
353  	for (i = 0; i < 9; i ++) {
354  		uint32_t w;
355  
356  		w = t[i] + cc;
357  		cc = w >> 30;
358  		t[i] = w & 0x3FFFFFFF;
359  	}
360  	cc = t[8] >> 15;
361  	t[8] &= 0x7FFF;
362  	CCOPY(cc, d, t, sizeof t);
363  	return cc;
364  }
365  
366  /*
367   * Perform a multiplication of two integers modulo 2^255-19.
368   * Operands are arrays of 9 words, each containing 30 bits of data, in
369   * little-endian order. Input value may be up to 2^256-1; on output, value
370   * fits on 256 bits and is lower than twice the modulus.
371   */
372  static void
373  f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374  {
375  	uint32_t t[18], cc;
376  	int i;
377  
378  	/*
379  	 * Compute raw multiplication. All result words fit in 30 bits
380  	 * each; upper word (t[17]) must fit on 2 bits, since the product
381  	 * of two 256-bit integers must fit on 512 bits.
382  	 */
383  	mul9(t, a, b);
384  
385  	/*
386  	 * Modular reduction: each high word is added where necessary.
387  	 * Since the modulus is 2^255-19 and word 9 corresponds to
388  	 * offset 9*30 = 270, word 9+k must be added to word k with
389  	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
390  	 * added that way.
391  	 *
392  	 * Keeping the carry on 32 bits helps with 32-bit architectures,
393  	 * and does not noticeably impact performance on 64-bit systems.
394  	 */
395  	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
396  	t[8] &= 0x7FFF;
397  	for (i = 0; i < 9; i ++) {
398  		uint64_t w;
399  
400  		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
401  		t[i] = (uint32_t)w & 0x3FFFFFFF;
402  		cc = (uint32_t)(w >> 30);  /* at most 622592 */
403  	}
404  
405  	/*
406  	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
407  	 * This was split into two parts (upper of 257 bits, lower of 255
408  	 * bits), and the upper was added to the lower with a factor 19,
409  	 * which means that the intermediate value is less than 77*2^255
410  	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
411  	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
412  	 */
413  	cc = MUL15(t[8] >> 15, 19);
414  	t[8] &= 0x7FFF;
415  	for (i = 0; i < 9; i ++) {
416  		uint32_t z;
417  
418  		z = t[i] + cc;
419  		d[i] = z & 0x3FFFFFFF;
420  		cc = z >> 30;
421  	}
422  
423  	/*
424  	 * Final result is at most 2^255 + 1443. In particular, the last
425  	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
426  	 */
427  }
428  
429  /*
430   * Perform a squaring of an integer modulo 2^255-19.
431   * Operands are arrays of 9 words, each containing 30 bits of data, in
432   * little-endian order. Input value may be up to 2^256-1; on output, value
433   * fits on 256 bits and is lower than twice the modulus.
434   */
435  static void
436  f255_square(uint32_t *d, const uint32_t *a)
437  {
438  	uint32_t t[18], cc;
439  	int i;
440  
441  	/*
442  	 * Compute raw squaring. All result words fit in 30 bits
443  	 * each; upper word (t[17]) must fit on 2 bits, since the square
444  	 * of a 256-bit integers must fit on 512 bits.
445  	 */
446  	square9(t, a);
447  
448  	/*
449  	 * Modular reduction: each high word is added where necessary.
450  	 * See f255_mul() for details on the reduction and carry limits.
451  	 */
452  	cc = MUL15(t[8] >> 15, 19);
453  	t[8] &= 0x7FFF;
454  	for (i = 0; i < 9; i ++) {
455  		uint64_t w;
456  
457  		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
458  		t[i] = (uint32_t)w & 0x3FFFFFFF;
459  		cc = (uint32_t)(w >> 30);
460  	}
461  	cc = MUL15(t[8] >> 15, 19);
462  	t[8] &= 0x7FFF;
463  	for (i = 0; i < 9; i ++) {
464  		uint32_t z;
465  
466  		z = t[i] + cc;
467  		d[i] = z & 0x3FFFFFFF;
468  		cc = z >> 30;
469  	}
470  }
471  
472  /*
473   * Add two values in F255. Partial reduction is performed (down to less
474   * than twice the modulus).
475   */
476  static void
477  f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
478  {
479  	/*
480  	 * Since operand words fit on 30 bits, we can use 32-bit
481  	 * variables throughout.
482  	 */
483  	int i;
484  	uint32_t cc, w;
485  
486  	cc = 0;
487  	for (i = 0; i < 9; i ++) {
488  		w = a[i] + b[i] + cc;
489  		d[i] = w & 0x3FFFFFFF;
490  		cc = w >> 30;
491  	}
492  	cc = MUL15(w >> 15, 19);
493  	d[8] &= 0x7FFF;
494  	for (i = 0; i < 9; i ++) {
495  		w = d[i] + cc;
496  		d[i] = w & 0x3FFFFFFF;
497  		cc = w >> 30;
498  	}
499  }
500  
501  /*
502   * Subtract one value from another in F255. Partial reduction is
503   * performed (down to less than twice the modulus).
504   */
505  static void
506  f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
507  {
508  	/*
509  	 * We actually compute a - b + 2*p, so that the final value is
510  	 * necessarily positive.
511  	 */
512  	int i;
513  	uint32_t cc, w;
514  
515  	cc = (uint32_t)-38;
516  	for (i = 0; i < 9; i ++) {
517  		w = a[i] - b[i] + cc;
518  		d[i] = w & 0x3FFFFFFF;
519  		cc = ARSH(w, 30);
520  	}
521  	cc = MUL15((w + 0x10000) >> 15, 19);
522  	d[8] &= 0x7FFF;
523  	for (i = 0; i < 9; i ++) {
524  		w = d[i] + cc;
525  		d[i] = w & 0x3FFFFFFF;
526  		cc = w >> 30;
527  	}
528  }
529  
530  /*
531   * Multiply an integer by the 'A24' constant (121665). Partial reduction
532   * is performed (down to less than twice the modulus).
533   */
534  static void
535  f255_mul_a24(uint32_t *d, const uint32_t *a)
536  {
537  	int i;
538  	uint64_t w;
539  	uint32_t cc;
540  
541  	/*
542  	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
543  	 * We single out the processing of the last word: intermediate
544  	 * value w is up to 121665*2^16, yielding a carry for the next
545  	 * loop of at most 19*(121665*2^16/2^15) = 4623289.
546  	 */
547  	cc = 0;
548  	for (i = 0; i < 8; i ++) {
549  		w = MUL31(a[i], 121665) + (uint64_t)cc;
550  		d[i] = (uint32_t)w & 0x3FFFFFFF;
551  		cc = (uint32_t)(w >> 30);
552  	}
553  	w = MUL31(a[8], 121665) + (uint64_t)cc;
554  	d[8] = (uint32_t)w & 0x7FFF;
555  	cc = MUL15((uint32_t)(w >> 15), 19);
556  
557  	for (i = 0; i < 9; i ++) {
558  		uint32_t z;
559  
560  		z = d[i] + cc;
561  		d[i] = z & 0x3FFFFFFF;
562  		cc = z >> 30;
563  	}
564  }
565  
566  static const unsigned char GEN[] = {
567  	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
568  	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
569  	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
570  	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
571  };
572  
573  static const unsigned char ORDER[] = {
574  	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
575  	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
576  	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
577  	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
578  };
579  
580  static const unsigned char *
581  api_generator(int curve, size_t *len)
582  {
583  	(void)curve;
584  	*len = 32;
585  	return GEN;
586  }
587  
588  static const unsigned char *
589  api_order(int curve, size_t *len)
590  {
591  	(void)curve;
592  	*len = 32;
593  	return ORDER;
594  }
595  
596  static size_t
597  api_xoff(int curve, size_t *len)
598  {
599  	(void)curve;
600  	*len = 32;
601  	return 0;
602  }
603  
604  static void
605  cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
606  {
607  	int i;
608  
609  	ctl = -ctl;
610  	for (i = 0; i < 9; i ++) {
611  		uint32_t aw, bw, tw;
612  
613  		aw = a[i];
614  		bw = b[i];
615  		tw = ctl & (aw ^ bw);
616  		a[i] = aw ^ tw;
617  		b[i] = bw ^ tw;
618  	}
619  }
620  
621  static uint32_t
622  api_mul(unsigned char *G, size_t Glen,
623  	const unsigned char *kb, size_t kblen, int curve)
624  {
625  	uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
626  	uint32_t a[9], aa[9], b[9], bb[9];
627  	uint32_t c[9], d[9], e[9], da[9], cb[9];
628  	unsigned char k[32];
629  	uint32_t swap;
630  	int i;
631  
632  	(void)curve;
633  
634  	/*
635  	 * Points are encoded over exactly 32 bytes. Multipliers must fit
636  	 * in 32 bytes as well.
637  	 * RFC 7748 mandates that the high bit of the last point byte must
638  	 * be ignored/cleared.
639  	 */
640  	if (Glen != 32 || kblen > 32) {
641  		return 0;
642  	}
643  	G[31] &= 0x7F;
644  
645  	/*
646  	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
647  	 * into Montgomery representation.
648  	 */
649  	x1[8] = le8_to_le30(x1, G, 32);
650  	memcpy(x3, x1, sizeof x1);
651  	memset(z2, 0, sizeof z2);
652  	memset(x2, 0, sizeof x2);
653  	x2[0] = 1;
654  	memset(z3, 0, sizeof z3);
655  	z3[0] = 1;
656  
657  	memset(k, 0, (sizeof k) - kblen);
658  	memcpy(k + (sizeof k) - kblen, kb, kblen);
659  	k[31] &= 0xF8;
660  	k[0] &= 0x7F;
661  	k[0] |= 0x40;
662  
663  	/* obsolete
664  	print_int("x1", x1);
665  	*/
666  
667  	swap = 0;
668  	for (i = 254; i >= 0; i --) {
669  		uint32_t kt;
670  
671  		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
672  		swap ^= kt;
673  		cswap(x2, x3, swap);
674  		cswap(z2, z3, swap);
675  		swap = kt;
676  
677  		/* obsolete
678  		print_int("x2", x2);
679  		print_int("z2", z2);
680  		print_int("x3", x3);
681  		print_int("z3", z3);
682  		*/
683  
684  		f255_add(a, x2, z2);
685  		f255_square(aa, a);
686  		f255_sub(b, x2, z2);
687  		f255_square(bb, b);
688  		f255_sub(e, aa, bb);
689  		f255_add(c, x3, z3);
690  		f255_sub(d, x3, z3);
691  		f255_mul(da, d, a);
692  		f255_mul(cb, c, b);
693  
694  		/* obsolete
695  		print_int("a ", a);
696  		print_int("aa", aa);
697  		print_int("b ", b);
698  		print_int("bb", bb);
699  		print_int("e ", e);
700  		print_int("c ", c);
701  		print_int("d ", d);
702  		print_int("da", da);
703  		print_int("cb", cb);
704  		*/
705  
706  		f255_add(x3, da, cb);
707  		f255_square(x3, x3);
708  		f255_sub(z3, da, cb);
709  		f255_square(z3, z3);
710  		f255_mul(z3, z3, x1);
711  		f255_mul(x2, aa, bb);
712  		f255_mul_a24(z2, e);
713  		f255_add(z2, z2, aa);
714  		f255_mul(z2, e, z2);
715  
716  		/* obsolete
717  		print_int("x2", x2);
718  		print_int("z2", z2);
719  		print_int("x3", x3);
720  		print_int("z3", z3);
721  		*/
722  	}
723  	cswap(x2, x3, swap);
724  	cswap(z2, z3, swap);
725  
726  	/*
727  	 * Inverse z2 with a modular exponentiation. This is a simple
728  	 * square-and-multiply algorithm; we mutualise most non-squarings
729  	 * since the exponent contains almost only ones.
730  	 */
731  	memcpy(a, z2, sizeof z2);
732  	for (i = 0; i < 15; i ++) {
733  		f255_square(a, a);
734  		f255_mul(a, a, z2);
735  	}
736  	memcpy(b, a, sizeof a);
737  	for (i = 0; i < 14; i ++) {
738  		int j;
739  
740  		for (j = 0; j < 16; j ++) {
741  			f255_square(b, b);
742  		}
743  		f255_mul(b, b, a);
744  	}
745  	for (i = 14; i >= 0; i --) {
746  		f255_square(b, b);
747  		if ((0xFFEB >> i) & 1) {
748  			f255_mul(b, z2, b);
749  		}
750  	}
751  	f255_mul(x2, x2, b);
752  	reduce_final_f255(x2);
753  	le30_to_le8(G, 32, x2);
754  	return 1;
755  }
756  
757  static size_t
758  api_mulgen(unsigned char *R,
759  	const unsigned char *x, size_t xlen, int curve)
760  {
761  	const unsigned char *G;
762  	size_t Glen;
763  
764  	G = api_generator(curve, &Glen);
765  	memcpy(R, G, Glen);
766  	api_mul(R, Glen, x, xlen, curve);
767  	return Glen;
768  }
769  
770  static uint32_t
771  api_muladd(unsigned char *A, const unsigned char *B, size_t len,
772  	const unsigned char *x, size_t xlen,
773  	const unsigned char *y, size_t ylen, int curve)
774  {
775  	/*
776  	 * We don't implement this method, since it is used for ECDSA
777  	 * only, and there is no ECDSA over Curve25519 (which instead
778  	 * uses EdDSA).
779  	 */
780  	(void)A;
781  	(void)B;
782  	(void)len;
783  	(void)x;
784  	(void)xlen;
785  	(void)y;
786  	(void)ylen;
787  	(void)curve;
788  	return 0;
789  }
790  
791  /* see bearssl_ec.h */
792  const br_ec_impl br_ec_c25519_m31 = {
793  	(uint32_t)0x20000000,
794  	&api_generator,
795  	&api_order,
796  	&api_xoff,
797  	&api_mul,
798  	&api_mulgen,
799  	&api_muladd
800  };