/ src / int / i31_montmul.c
i31_montmul.c
  1  /*
  2   * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
  3   *
  4   * Permission is hereby granted, free of charge, to any person obtaining 
  5   * a copy of this software and associated documentation files (the
  6   * "Software"), to deal in the Software without restriction, including
  7   * without limitation the rights to use, copy, modify, merge, publish,
  8   * distribute, sublicense, and/or sell copies of the Software, and to
  9   * permit persons to whom the Software is furnished to do so, subject to
 10   * the following conditions:
 11   *
 12   * The above copyright notice and this permission notice shall be 
 13   * included in all copies or substantial portions of the Software.
 14   *
 15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22   * SOFTWARE.
 23   */
 24  
 25  #include "inner.h"
 26  
 27  /* see inner.h */
 28  void
 29  br_i31_montymul(uint32_t *d, const uint32_t *x, const uint32_t *y,
 30  	const uint32_t *m, uint32_t m0i)
 31  {
 32  	/*
 33  	 * Each outer loop iteration computes:
 34  	 *   d <- (d + xu*y + f*m) / 2^31
 35  	 * We have xu <= 2^31-1 and f <= 2^31-1.
 36  	 * Thus, if d <= 2*m-1 on input, then:
 37  	 *   2*m-1 + 2*(2^31-1)*m <= (2^32)*m-1
 38  	 * and the new d value is less than 2*m.
 39  	 *
 40  	 * We represent d over 31-bit words, with an extra word 'dh'
 41  	 * which can thus be only 0 or 1.
 42  	 */
 43  	size_t len, len4, u, v;
 44  	uint32_t dh;
 45  
 46  	len = (m[0] + 31) >> 5;
 47  	len4 = len & ~(size_t)3;
 48  	br_i31_zero(d, m[0]);
 49  	dh = 0;
 50  	for (u = 0; u < len; u ++) {
 51  		/*
 52  		 * The carry for each operation fits on 32 bits:
 53  		 *   d[v+1] <= 2^31-1
 54  		 *   xu*y[v+1] <= (2^31-1)*(2^31-1)
 55  		 *   f*m[v+1] <= (2^31-1)*(2^31-1)
 56  		 *   r <= 2^32-1
 57  		 *   (2^31-1) + 2*(2^31-1)*(2^31-1) + (2^32-1) = 2^63 - 2^31
 58  		 * After division by 2^31, the new r is then at most 2^32-1
 59  		 *
 60  		 * Using a 32-bit carry has performance benefits on 32-bit
 61  		 * systems; however, on 64-bit architectures, we prefer to
 62  		 * keep the carry (r) in a 64-bit register, thus avoiding some
 63  		 * "clear high bits" operations.
 64  		 */
 65  		uint32_t f, xu;
 66  #if BR_64
 67  		uint64_t r;
 68  #else
 69  		uint32_t r;
 70  #endif
 71  
 72  		xu = x[u + 1];
 73  		f = MUL31_lo((d[1] + MUL31_lo(x[u + 1], y[1])), m0i);
 74  
 75  		r = 0;
 76  		for (v = 0; v < len4; v += 4) {
 77  			uint64_t z;
 78  
 79  			z = (uint64_t)d[v + 1] + MUL31(xu, y[v + 1])
 80  				+ MUL31(f, m[v + 1]) + r;
 81  			r = z >> 31;
 82  			d[v + 0] = (uint32_t)z & 0x7FFFFFFF;
 83  			z = (uint64_t)d[v + 2] + MUL31(xu, y[v + 2])
 84  				+ MUL31(f, m[v + 2]) + r;
 85  			r = z >> 31;
 86  			d[v + 1] = (uint32_t)z & 0x7FFFFFFF;
 87  			z = (uint64_t)d[v + 3] + MUL31(xu, y[v + 3])
 88  				+ MUL31(f, m[v + 3]) + r;
 89  			r = z >> 31;
 90  			d[v + 2] = (uint32_t)z & 0x7FFFFFFF;
 91  			z = (uint64_t)d[v + 4] + MUL31(xu, y[v + 4])
 92  				+ MUL31(f, m[v + 4]) + r;
 93  			r = z >> 31;
 94  			d[v + 3] = (uint32_t)z & 0x7FFFFFFF;
 95  		}
 96  		for (; v < len; v ++) {
 97  			uint64_t z;
 98  
 99  			z = (uint64_t)d[v + 1] + MUL31(xu, y[v + 1])
100  				+ MUL31(f, m[v + 1]) + r;
101  			r = z >> 31;
102  			d[v] = (uint32_t)z & 0x7FFFFFFF;
103  		}
104  
105  		/*
106  		 * Since the new dh can only be 0 or 1, the addition of
107  		 * the old dh with the carry MUST fit on 32 bits, and
108  		 * thus can be done into dh itself.
109  		 */
110  		dh += r;
111  		d[len] = dh & 0x7FFFFFFF;
112  		dh >>= 31;
113  	}
114  
115  	/*
116  	 * We must write back the bit length because it was overwritten in
117  	 * the loop (not overwriting it would require a test in the loop,
118  	 * which would yield bigger and slower code).
119  	 */
120  	d[0] = m[0];
121  
122  	/*
123  	 * d[] may still be greater than m[] at that point; notably, the
124  	 * 'dh' word may be non-zero.
125  	 */
126  	br_i31_sub(d, m, NEQ(dh, 0) | NOT(br_i31_sub(d, m, 0)));
127  }