/ src / int / i15_moddiv.c
i15_moddiv.c
  1  /*
  2   * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
  3   *
  4   * Permission is hereby granted, free of charge, to any person obtaining 
  5   * a copy of this software and associated documentation files (the
  6   * "Software"), to deal in the Software without restriction, including
  7   * without limitation the rights to use, copy, modify, merge, publish,
  8   * distribute, sublicense, and/or sell copies of the Software, and to
  9   * permit persons to whom the Software is furnished to do so, subject to
 10   * the following conditions:
 11   *
 12   * The above copyright notice and this permission notice shall be 
 13   * included in all copies or substantial portions of the Software.
 14   *
 15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22   * SOFTWARE.
 23   */
 24  
 25  #include "inner.h"
 26  
 27  /*
 28   * In this file, we handle big integers with a custom format, i.e.
 29   * without the usual one-word header. Value is split into 15-bit words,
 30   * each stored in a 16-bit slot (top bit is zero) in little-endian
 31   * order. The length (in words) is provided explicitly. In some cases,
 32   * the value can be negative (using two's complement representation). In
 33   * some cases, the top word is allowed to have a 16th bit.
 34   */
 35  
 36  /*
 37   * Negate big integer conditionally. The value consists of 'len' words,
 38   * with 15 bits in each word (the top bit of each word should be 0,
 39   * except possibly for the last word). If 'ctl' is 1, the negation is
 40   * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
 41   */
 42  static void
 43  cond_negate(uint16_t *a, size_t len, uint32_t ctl)
 44  {
 45  	size_t k;
 46  	uint32_t cc, xm;
 47  
 48  	cc = ctl;
 49  	xm = 0x7FFF & -ctl;
 50  	for (k = 0; k < len; k ++) {
 51  		uint32_t aw;
 52  
 53  		aw = a[k];
 54  		aw = (aw ^ xm) + cc;
 55  		a[k] = aw & 0x7FFF;
 56  		cc = (aw >> 15) & 1;
 57  	}
 58  }
 59  
 60  /*
 61   * Finish modular reduction. Rules on input parameters:
 62   *
 63   *   if neg = 1, then -m <= a < 0
 64   *   if neg = 0, then 0 <= a < 2*m
 65   *
 66   * If neg = 0, then the top word of a[] may use 16 bits.
 67   *
 68   * Also, modulus m must be odd.
 69   */
 70  static void
 71  finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)
 72  {
 73  	size_t k;
 74  	uint32_t cc, xm, ym;
 75  
 76  	/*
 77  	 * First pass: compare a (assumed nonnegative) with m.
 78  	 */
 79  	cc = 0;
 80  	for (k = 0; k < len; k ++) {
 81  		uint32_t aw, mw;
 82  
 83  		aw = a[k];
 84  		mw = m[k];
 85  		cc = (aw - mw - cc) >> 31;
 86  	}
 87  
 88  	/*
 89  	 * At this point:
 90  	 *   if neg = 1, then we must add m (regardless of cc)
 91  	 *   if neg = 0 and cc = 0, then we must subtract m
 92  	 *   if neg = 0 and cc = 1, then we must do nothing
 93  	 */
 94  	xm = 0x7FFF & -neg;
 95  	ym = -(neg | (1 - cc));
 96  	cc = neg;
 97  	for (k = 0; k < len; k ++) {
 98  		uint32_t aw, mw;
 99  
100  		aw = a[k];
101  		mw = (m[k] ^ xm) & ym;
102  		aw = aw - mw - cc;
103  		a[k] = aw & 0x7FFF;
104  		cc = aw >> 31;
105  	}
106  }
107  
108  /*
109   * Compute:
110   *   a <- (a*pa+b*pb)/(2^15)
111   *   b <- (a*qa+b*qb)/(2^15)
112   * The division is assumed to be exact (i.e. the low word is dropped).
113   * If the final a is negative, then it is negated. Similarly for b.
114   * Returned value is the combination of two bits:
115   *   bit 0: 1 if a had to be negated, 0 otherwise
116   *   bit 1: 1 if b had to be negated, 0 otherwise
117   *
118   * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
119   * Source integers a and b must be nonnegative; top word is not allowed
120   * to contain an extra 16th bit.
121   */
122  static uint32_t
123  co_reduce(uint16_t *a, uint16_t *b, size_t len,
124  	int32_t pa, int32_t pb, int32_t qa, int32_t qb)
125  {
126  	size_t k;
127  	int32_t cca, ccb;
128  	uint32_t nega, negb;
129  
130  	cca = 0;
131  	ccb = 0;
132  	for (k = 0; k < len; k ++) {
133  		uint32_t wa, wb, za, zb;
134  		uint16_t tta, ttb;
135  
136  		/*
137  		 * Since:
138  		 *   |pa| <= 2^15
139  		 *   |pb| <= 2^15
140  		 *   0 <= wa <= 2^15 - 1
141  		 *   0 <= wb <= 2^15 - 1
142  		 *   |cca| <= 2^16 - 1
143  		 * Then:
144  		 *   |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
145  		 *
146  		 * Thus, the new value of cca is such that |cca| <= 2^16 - 1.
147  		 * The same applies to ccb.
148  		 */
149  		wa = a[k];
150  		wb = b[k];
151  		za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
152  		zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
153  		if (k > 0) {
154  			a[k - 1] = za & 0x7FFF;
155  			b[k - 1] = zb & 0x7FFF;
156  		}
157  		tta = za >> 15;
158  		ttb = zb >> 15;
159  		cca = *(int16_t *)&tta;
160  		ccb = *(int16_t *)&ttb;
161  	}
162  	a[len - 1] = (uint16_t)cca;
163  	b[len - 1] = (uint16_t)ccb;
164  	nega = (uint32_t)cca >> 31;
165  	negb = (uint32_t)ccb >> 31;
166  	cond_negate(a, len, nega);
167  	cond_negate(b, len, negb);
168  	return nega | (negb << 1);
169  }
170  
171  /*
172   * Compute:
173   *   a <- (a*pa+b*pb)/(2^15) mod m
174   *   b <- (a*qa+b*qb)/(2^15) mod m
175   *
176   * m0i is equal to -1/m[0] mod 2^15.
177   *
178   * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
179   * Source integers a and b must be nonnegative; top word is not allowed
180   * to contain an extra 16th bit.
181   */
182  static void
183  co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,
184  	int32_t pa, int32_t pb, int32_t qa, int32_t qb,
185  	const uint16_t *m, uint16_t m0i)
186  {
187  	size_t k;
188  	int32_t cca, ccb, fa, fb;
189  
190  	cca = 0;
191  	ccb = 0;
192  	fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
193  	fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
194  	for (k = 0; k < len; k ++) {
195  		uint32_t wa, wb, za, zb;
196  		uint32_t tta, ttb;
197  
198  		/*
199  		 * In this loop, carries 'cca' and 'ccb' always fit on
200  		 * 17 bits (in absolute value).
201  		 */
202  		wa = a[k];
203  		wb = b[k];
204  		za = wa * (uint32_t)pa + wb * (uint32_t)pb
205  			+ m[k] * (uint32_t)fa + (uint32_t)cca;
206  		zb = wa * (uint32_t)qa + wb * (uint32_t)qb
207  			+ m[k] * (uint32_t)fb + (uint32_t)ccb;
208  		if (k > 0) {
209  			a[k - 1] = za & 0x7FFF;
210  			b[k - 1] = zb & 0x7FFF;
211  		}
212  
213  		/*
214  		 * The XOR-and-sub construction below does an arithmetic
215  		 * right shift in a portable way (technically, right-shifting
216  		 * a negative signed value is implementation-defined in C).
217  		 */
218  #define M   ((uint32_t)1 << 16)
219  		tta = za >> 15;
220  		ttb = zb >> 15;
221  		tta = (tta ^ M) - M;
222  		ttb = (ttb ^ M) - M;
223  		cca = *(int32_t *)&tta;
224  		ccb = *(int32_t *)&ttb;
225  #undef M
226  	}
227  	a[len - 1] = (uint32_t)cca;
228  	b[len - 1] = (uint32_t)ccb;
229  
230  	/*
231  	 * At this point:
232  	 *   -m <= a < 2*m
233  	 *   -m <= b < 2*m
234  	 * (this is a case of Montgomery reduction)
235  	 * The top word of 'a' and 'b' may have a 16-th bit set.
236  	 * We may have to add or subtract the modulus.
237  	 */
238  	finish_mod(a, len, m, (uint32_t)cca >> 31);
239  	finish_mod(b, len, m, (uint32_t)ccb >> 31);
240  }
241  
242  /* see inner.h */
243  uint32_t
244  br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,
245  	uint16_t *t)
246  {
247  	/*
248  	 * Algorithm is an extended binary GCD. We maintain four values
249  	 * a, b, u and v, with the following invariants:
250  	 *
251  	 *   a * x = y * u mod m
252  	 *   b * x = y * v mod m
253  	 *
254  	 * Starting values are:
255  	 *
256  	 *   a = y
257  	 *   b = m
258  	 *   u = x
259  	 *   v = 0
260  	 *
261  	 * The formal definition of the algorithm is a sequence of steps:
262  	 *
263  	 *   - If a is even, then a <- a/2 and u <- u/2 mod m.
264  	 *   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
265  	 *   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
266  	 *   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
267  	 *
268  	 * Algorithm stops when a = b. At that point, they both are equal
269  	 * to GCD(y,m); the modular division succeeds if that value is 1.
270  	 * The result of the modular division is then u (or v: both are
271  	 * equal at that point).
272  	 *
273  	 * Each step makes either a or b shrink by at least one bit; hence,
274  	 * if m has bit length k bits, then 2k-2 steps are sufficient.
275  	 *
276  	 *
277  	 * Though complexity is quadratic in the size of m, the bit-by-bit
278  	 * processing is not very efficient. We can speed up processing by
279  	 * remarking that the decisions are taken based only on observation
280  	 * of the top and low bits of a and b.
281  	 *
282  	 * In the loop below, at each iteration, we use the two top words
283  	 * of a and b, and the low words of a and b, to compute reduction
284  	 * parameters pa, pb, qa and qb such that the new values for a
285  	 * and b are:
286  	 *
287  	 *   a' = (a*pa + b*pb) / (2^15)
288  	 *   b' = (a*qa + b*qb) / (2^15)
289  	 *
290  	 * the division being exact.
291  	 *
292  	 * Since the choices are based on the top words, they may be slightly
293  	 * off, requiring an optional correction: if a' < 0, then we replace
294  	 * pa with -pa, and pb with -pb. The total length of a and b is
295  	 * thus reduced by at least 14 bits at each iteration.
296  	 *
297  	 * The stopping conditions are still the same, though: when a
298  	 * and b become equal, they must be both odd (since m is odd,
299  	 * the GCD cannot be even), therefore the next operation is a
300  	 * subtraction, and one of the values becomes 0. At that point,
301  	 * nothing else happens, i.e. one value is stuck at 0, and the
302  	 * other one is the GCD.
303  	 */
304  	size_t len, k;
305  	uint16_t *a, *b, *u, *v;
306  	uint32_t num, r;
307  
308  	len = (m[0] + 15) >> 4;
309  	a = t;
310  	b = a + len;
311  	u = x + 1;
312  	v = b + len;
313  	memcpy(a, y + 1, len * sizeof *y);
314  	memcpy(b, m + 1, len * sizeof *m);
315  	memset(v, 0, len * sizeof *v);
316  
317  	/*
318  	 * Loop below ensures that a and b are reduced by some bits each,
319  	 * for a total of at least 14 bits.
320  	 */
321  	for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
322  		size_t j;
323  		uint32_t c0, c1;
324  		uint32_t a0, a1, b0, b1;
325  		uint32_t a_hi, b_hi, a_lo, b_lo;
326  		int32_t pa, pb, qa, qb;
327  		int i;
328  
329  		/*
330  		 * Extract top words of a and b. If j is the highest
331  		 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
332  		 * (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
333  		 * If a and b are down to one word each, then we use a[0]
334  		 * and b[0].
335  		 */
336  		c0 = (uint32_t)-1;
337  		c1 = (uint32_t)-1;
338  		a0 = 0;
339  		a1 = 0;
340  		b0 = 0;
341  		b1 = 0;
342  		j = len;
343  		while (j -- > 0) {
344  			uint32_t aw, bw;
345  
346  			aw = a[j];
347  			bw = b[j];
348  			a0 ^= (a0 ^ aw) & c0;
349  			a1 ^= (a1 ^ aw) & c1;
350  			b0 ^= (b0 ^ bw) & c0;
351  			b1 ^= (b1 ^ bw) & c1;
352  			c1 = c0;
353  			c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;
354  		}
355  
356  		/*
357  		 * If c1 = 0, then we grabbed two words for a and b.
358  		 * If c1 != 0 but c0 = 0, then we grabbed one word. It
359  		 * is not possible that c1 != 0 and c0 != 0, because that
360  		 * would mean that both integers are zero.
361  		 */
362  		a1 |= a0 & c1;
363  		a0 &= ~c1;
364  		b1 |= b0 & c1;
365  		b0 &= ~c1;
366  		a_hi = (a0 << 15) + a1;
367  		b_hi = (b0 << 15) + b1;
368  		a_lo = a[0];
369  		b_lo = b[0];
370  
371  		/*
372  		 * Compute reduction factors:
373  		 *
374  		 *   a' = a*pa + b*pb
375  		 *   b' = a*qa + b*qb
376  		 *
377  		 * such that a' and b' are both multiple of 2^15, but are
378  		 * only marginally larger than a and b.
379  		 */
380  		pa = 1;
381  		pb = 0;
382  		qa = 0;
383  		qb = 1;
384  		for (i = 0; i < 15; i ++) {
385  			/*
386  			 * At each iteration:
387  			 *
388  			 *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
389  			 *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
390  			 *   a <- a/2 if: a is even
391  			 *   b <- b/2 if: a is odd, b is even
392  			 *
393  			 * We multiply a_lo and b_lo by 2 at each
394  			 * iteration, thus a division by 2 really is a
395  			 * non-multiplication by 2.
396  			 */
397  			uint32_t r, oa, ob, cAB, cBA, cA;
398  
399  			/*
400  			 * cAB = 1 if b must be subtracted from a
401  			 * cBA = 1 if a must be subtracted from b
402  			 * cA = 1 if a is divided by 2, 0 otherwise
403  			 *
404  			 * Rules:
405  			 *
406  			 *   cAB and cBA cannot be both 1.
407  			 *   if a is not divided by 2, b is.
408  			 */
409  			r = GT(a_hi, b_hi);
410  			oa = (a_lo >> i) & 1;
411  			ob = (b_lo >> i) & 1;
412  			cAB = oa & ob & r;
413  			cBA = oa & ob & NOT(r);
414  			cA = cAB | NOT(oa);
415  
416  			/*
417  			 * Conditional subtractions.
418  			 */
419  			a_lo -= b_lo & -cAB;
420  			a_hi -= b_hi & -cAB;
421  			pa -= qa & -(int32_t)cAB;
422  			pb -= qb & -(int32_t)cAB;
423  			b_lo -= a_lo & -cBA;
424  			b_hi -= a_hi & -cBA;
425  			qa -= pa & -(int32_t)cBA;
426  			qb -= pb & -(int32_t)cBA;
427  
428  			/*
429  			 * Shifting.
430  			 */
431  			a_lo += a_lo & (cA - 1);
432  			pa += pa & ((int32_t)cA - 1);
433  			pb += pb & ((int32_t)cA - 1);
434  			a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
435  			b_lo += b_lo & -cA;
436  			qa += qa & -(int32_t)cA;
437  			qb += qb & -(int32_t)cA;
438  			b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
439  		}
440  
441  		/*
442  		 * Replace a and b with new values a' and b'.
443  		 */
444  		r = co_reduce(a, b, len, pa, pb, qa, qb);
445  		pa -= pa * ((r & 1) << 1);
446  		pb -= pb * ((r & 1) << 1);
447  		qa -= qa * (r & 2);
448  		qb -= qb * (r & 2);
449  		co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
450  	}
451  
452  	/*
453  	 * Now one of the arrays should be 0, and the other contains
454  	 * the GCD. If a is 0, then u is 0 as well, and v contains
455  	 * the division result.
456  	 * Result is correct if and only if GCD is 1.
457  	 */
458  	r = (a[0] | b[0]) ^ 1;
459  	u[0] |= v[0];
460  	for (k = 1; k < len; k ++) {
461  		r |= a[k] | b[k];
462  		u[k] |= v[k];
463  	}
464  	return EQ0(r);
465  }