/ src / rsa / rsa_i31_privexp.c
rsa_i31_privexp.c
  1  /*
  2   * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
  3   *
  4   * Permission is hereby granted, free of charge, to any person obtaining 
  5   * a copy of this software and associated documentation files (the
  6   * "Software"), to deal in the Software without restriction, including
  7   * without limitation the rights to use, copy, modify, merge, publish,
  8   * distribute, sublicense, and/or sell copies of the Software, and to
  9   * permit persons to whom the Software is furnished to do so, subject to
 10   * the following conditions:
 11   *
 12   * The above copyright notice and this permission notice shall be 
 13   * included in all copies or substantial portions of the Software.
 14   *
 15   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 16   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 17   * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 18   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 19   * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 20   * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 21   * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22   * SOFTWARE.
 23   */
 24  
 25  #include "inner.h"
 26  
 27  /* see bearssl_rsa.h */
 28  size_t
 29  br_rsa_i31_compute_privexp(void *d,
 30  	const br_rsa_private_key *sk, uint32_t e)
 31  {
 32  	/*
 33  	 * We want to invert e modulo phi = (p-1)(q-1). This first
 34  	 * requires computing phi, which is easy since we have the factors
 35  	 * p and q in the private key structure.
 36  	 *
 37  	 * Since p = 3 mod 4 and q = 3 mod 4, phi/4 is an odd integer.
 38  	 * We could invert e modulo phi/4 then patch the result to
 39  	 * modulo phi, but this would involve assembling three modulus-wide
 40  	 * values (phi/4, 1 and e) and calling moddiv, that requires
 41  	 * three more temporaries, for a total of six big integers, or
 42  	 * slightly more than 3 kB of stack space for RSA-4096. This
 43  	 * exceeds our stack requirements.
 44  	 *
 45  	 * Instead, we first use one step of the extended GCD:
 46  	 *
 47  	 *   - We compute phi = k*e + r  (Euclidean division of phi by e).
 48  	 *     If public exponent e is correct, then r != 0 (e must be
 49  	 *     invertible modulo phi). We also have k != 0 since we
 50  	 *     enforce non-ridiculously-small factors.
 51  	 *
 52  	 *   - We find small u, v such that u*e - v*r = 1  (using a
 53  	 *     binary GCD; we can arrange for u < r and v < e, i.e. all
 54  	 *     values fit on 32 bits).
 55  	 *
 56  	 *   - Solution is: d = u + v*k
 57  	 *     This last computation is exact: since u < r and v < e,
 58  	 *     the above implies d < r + e*((phi-r)/e) = phi
 59  	 */
 60  
 61  	uint32_t tmp[4 * ((BR_MAX_RSA_FACTOR + 30) / 31) + 12];
 62  	uint32_t *p, *q, *k, *m, *z, *phi;
 63  	const unsigned char *pbuf, *qbuf;
 64  	size_t plen, qlen, u, len, dlen;
 65  	uint32_t r, a, b, u0, v0, u1, v1, he, hr;
 66  	int i;
 67  
 68  	/*
 69  	 * Check that e is correct.
 70  	 */
 71  	if (e < 3 || (e & 1) == 0) {
 72  		return 0;
 73  	}
 74  
 75  	/*
 76  	 * Check lengths of p and q, and that they are both odd.
 77  	 */
 78  	pbuf = sk->p;
 79  	plen = sk->plen;
 80  	while (plen > 0 && *pbuf == 0) {
 81  		pbuf ++;
 82  		plen --;
 83  	}
 84  	if (plen < 5 || plen > (BR_MAX_RSA_FACTOR / 8)
 85  		|| (pbuf[plen - 1] & 1) != 1)
 86  	{
 87  		return 0;
 88  	}
 89  	qbuf = sk->q;
 90  	qlen = sk->qlen;
 91  	while (qlen > 0 && *qbuf == 0) {
 92  		qbuf ++;
 93  		qlen --;
 94  	}
 95  	if (qlen < 5 || qlen > (BR_MAX_RSA_FACTOR / 8)
 96  		|| (qbuf[qlen - 1] & 1) != 1)
 97  	{
 98  		return 0;
 99  	}
100  
101  	/*
102  	 * Output length is that of the modulus.
103  	 */
104  	dlen = (sk->n_bitlen + 7) >> 3;
105  	if (d == NULL) {
106  		return dlen;
107  	}
108  
109  	p = tmp;
110  	br_i31_decode(p, pbuf, plen);
111  	plen = (p[0] + 31) >> 5;
112  	q = p + 1 + plen;
113  	br_i31_decode(q, qbuf, qlen);
114  	qlen = (q[0] + 31) >> 5;
115  
116  	/*
117  	 * Compute phi = (p-1)*(q-1), then move it over p-1 and q-1 (that
118  	 * we do not need anymore). The mulacc function sets the announced
119  	 * bit length of t to be the sum of the announced bit lengths of
120  	 * p-1 and q-1, which is usually exact but may overshoot by one 1
121  	 * bit in some cases; we readjust it to its true length.
122  	 */
123  	p[1] --;
124  	q[1] --;
125  	phi = q + 1 + qlen;
126  	br_i31_zero(phi, p[0]);
127  	br_i31_mulacc(phi, p, q);
128  	len = (phi[0] + 31) >> 5;
129  	memmove(tmp, phi, (1 + len) * sizeof *phi);
130  	phi = tmp;
131  	phi[0] = br_i31_bit_length(phi + 1, len);
132  	len = (phi[0] + 31) >> 5;
133  
134  	/*
135  	 * Divide phi by public exponent e. The final remainder r must be
136  	 * non-zero (otherwise, the key is invalid). The quotient is k,
137  	 * which we write over phi, since we don't need phi after that.
138  	 */
139  	r = 0;
140  	for (u = len; u >= 1; u --) {
141  		/*
142  		 * Upon entry, r < e, and phi[u] < 2^31; hence,
143  		 * hi:lo < e*2^31. Thus, the produced word k[u]
144  		 * must be lower than 2^31, and the new remainder r
145  		 * is lower than e.
146  		 */
147  		uint32_t hi, lo;
148  
149  		hi = r >> 1;
150  		lo = (r << 31) + phi[u];
151  		phi[u] = br_divrem(hi, lo, e, &r);
152  	}
153  	if (r == 0) {
154  		return 0;
155  	}
156  	k = phi;
157  
158  	/*
159  	 * Compute u and v such that u*e - v*r = GCD(e,r). We use
160  	 * a binary GCD algorithm, with 6 extra integers a, b,
161  	 * u0, u1, v0 and v1. Initial values are:
162  	 *   a = e    u0 = 1   v0 = 0
163  	 *   b = r    u1 = r   v1 = e-1
164  	 * The following invariants are maintained:
165  	 *   a = u0*e - v0*r
166  	 *   b = u1*e - v1*r
167  	 *   0 < a <= e
168  	 *   0 < b <= r
169  	 *   0 <= u0 <= r
170  	 *   0 <= v0 <= e
171  	 *   0 <= u1 <= r
172  	 *   0 <= v1 <= e
173  	 *
174  	 * At each iteration, we reduce either a or b by one bit, and
175  	 * adjust u0, u1, v0 and v1 to maintain the invariants:
176  	 *  - if a is even, then a <- a/2
177  	 *  - otherwise, if b is even, then b <- b/2
178  	 *  - otherwise, if a > b, then a <- (a-b)/2
179  	 *  - otherwise, if b > a, then b <- (b-a)/2
180  	 * Algorithm stops when a = b. At that point, the common value
181  	 * is the GCD of e and r; it must be 1 (otherwise, the private
182  	 * key or public exponent is not valid). The (u0,v0) or (u1,v1)
183  	 * pairs are the solution we are looking for.
184  	 *
185  	 * Since either a or b is reduced by at least 1 bit at each
186  	 * iteration, 62 iterations are enough to reach the end
187  	 * condition.
188  	 *
189  	 * To maintain the invariants, we must compute the same operations
190  	 * on the u* and v* values that we do on a and b:
191  	 *  - When a is divided by 2, u0 and v0 must be divided by 2.
192  	 *  - When b is divided by 2, u1 and v1 must be divided by 2.
193  	 *  - When b is subtracted from a, u1 and v1 are subtracted from
194  	 *    u0 and v0, respectively.
195  	 *  - When a is subtracted from b, u0 and v0 are subtracted from
196  	 *    u1 and v1, respectively.
197  	 *
198  	 * However, we want to keep the u* and v* values in their proper
199  	 * ranges. The following remarks apply:
200  	 *
201  	 *  - When a is divided by 2, then a is even. Therefore:
202  	 *
203  	 *     * If r is odd, then u0 and v0 must have the same parity;
204  	 *       if they are both odd, then adding r to u0 and e to v0
205  	 *       makes them both even, and the division by 2 brings them
206  	 *       back to the proper range.
207  	 *
208  	 *     * If r is even, then u0 must be even; if v0 is odd, then
209  	 *       adding r to u0 and e to v0 makes them both even, and the
210  	 *       division by 2 brings them back to the proper range.
211  	 *
212  	 *    Thus, all we need to do is to look at the parity of v0,
213  	 *    and add (r,e) to (u0,v0) when v0 is odd. In order to avoid
214  	 *    a 32-bit overflow, we can add ((r+1)/2,(e/2)+1) after the
215  	 *    division (r+1 does not overflow since r < e; and (e/2)+1
216  	 *    is equal to (e+1)/2 since e is odd).
217  	 *
218  	 *  - When we subtract b from a, three cases may occur:
219  	 *
220  	 *     * u1 <= u0 and v1 <= v0: just do the subtractions
221  	 *
222  	 *     * u1 > u0 and v1 > v0: compute:
223  	 *         (u0, v0) <- (u0 + r - u1, v0 + e - v1)
224  	 *
225  	 *     * u1 <= u0 and v1 > v0: compute:
226  	 *         (u0, v0) <- (u0 + r - u1, v0 + e - v1)
227  	 *
228  	 *    The fourth case (u1 > u0 and v1 <= v0) is not possible
229  	 *    because it would contradict "b < a" (which is the reason
230  	 *    why we subtract b from a).
231  	 *
232  	 *    The tricky case is the third one: from the equations, it
233  	 *    seems that u0 may go out of range. However, the invariants
234  	 *    and ranges of other values imply that, in that case, the
235  	 *    new u0 does not actually exceed the range.
236  	 *
237  	 *    We can thus handle the subtraction by adding (r,e) based
238  	 *    solely on the comparison between v0 and v1.
239  	 */
240  	a = e;
241  	b = r;
242  	u0 = 1;
243  	v0 = 0;
244  	u1 = r;
245  	v1 = e - 1;
246  	hr = (r + 1) >> 1;
247  	he = (e >> 1) + 1;
248  	for (i = 0; i < 62; i ++) {
249  		uint32_t oa, ob, agtb, bgta;
250  		uint32_t sab, sba, da, db;
251  		uint32_t ctl;
252  
253  		oa = a & 1;                  /* 1 if a is odd */
254  		ob = b & 1;                  /* 1 if b is odd */
255  		agtb = GT(a, b);             /* 1 if a > b */
256  		bgta = GT(b, a);             /* 1 if b > a */
257  
258  		sab = oa & ob & agtb;        /* 1 if a <- a-b */
259  		sba = oa & ob & bgta;        /* 1 if b <- b-a */
260  
261  		/* a <- a-b, u0 <- u0-u1, v0 <- v0-v1 */
262  		ctl = GT(v1, v0);
263  		a -= b & -sab;
264  		u0 -= (u1 - (r & -ctl)) & -sab;
265  		v0 -= (v1 - (e & -ctl)) & -sab;
266  
267  		/* b <- b-a, u1 <- u1-u0 mod r, v1 <- v1-v0 mod e */
268  		ctl = GT(v0, v1);
269  		b -= a & -sba;
270  		u1 -= (u0 - (r & -ctl)) & -sba;
271  		v1 -= (v0 - (e & -ctl)) & -sba;
272  
273  		da = NOT(oa) | sab;          /* 1 if a <- a/2 */
274  		db = (oa & NOT(ob)) | sba;   /* 1 if b <- b/2 */
275  
276  		/* a <- a/2, u0 <- u0/2, v0 <- v0/2 */
277  		ctl = v0 & 1;
278  		a ^= (a ^ (a >> 1)) & -da;
279  		u0 ^= (u0 ^ ((u0 >> 1) + (hr & -ctl))) & -da;
280  		v0 ^= (v0 ^ ((v0 >> 1) + (he & -ctl))) & -da;
281  
282  		/* b <- b/2, u1 <- u1/2 mod r, v1 <- v1/2 mod e */
283  		ctl = v1 & 1;
284  		b ^= (b ^ (b >> 1)) & -db;
285  		u1 ^= (u1 ^ ((u1 >> 1) + (hr & -ctl))) & -db;
286  		v1 ^= (v1 ^ ((v1 >> 1) + (he & -ctl))) & -db;
287  	}
288  
289  	/*
290  	 * Check that the GCD is indeed 1. If not, then the key is invalid
291  	 * (and there's no harm in leaking that piece of information).
292  	 */
293  	if (a != 1) {
294  		return 0;
295  	}
296  
297  	/*
298  	 * Now we have u0*e - v0*r = 1. Let's compute the result as:
299  	 *   d = u0 + v0*k
300  	 * We still have k in the tmp[] array, and its announced bit
301  	 * length is that of phi.
302  	 */
303  	m = k + 1 + len;
304  	m[0] = (1 << 5) + 1;  /* bit length is 32 bits, encoded */
305  	m[1] = v0 & 0x7FFFFFFF;
306  	m[2] = v0 >> 31;
307  	z = m + 3;
308  	br_i31_zero(z, k[0]);
309  	z[1] = u0 & 0x7FFFFFFF;
310  	z[2] = u0 >> 31;
311  	br_i31_mulacc(z, k, m);
312  
313  	/*
314  	 * Encode the result.
315  	 */
316  	br_i31_encode(d, dlen, z);
317  	return dlen;
318  }