/ src / int / i15_montmul.c
i15_montmul.c
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 24  
 25  #include "inner.h"
 26  
 27  /* see inner.h */
 28  void
 29  br_i15_montymul(uint16_t *d, const uint16_t *x, const uint16_t *y,
 30  	const uint16_t *m, uint16_t m0i)
 31  {
 32  	size_t len, len4, u, v;
 33  	uint32_t dh;
 34  
 35  	len = (m[0] + 15) >> 4;
 36  	len4 = len & ~(size_t)3;
 37  	br_i15_zero(d, m[0]);
 38  	dh = 0;
 39  	for (u = 0; u < len; u ++) {
 40  		uint32_t f, xu, r, zh;
 41  
 42  		xu = x[u + 1];
 43  		f = MUL15((d[1] + MUL15(x[u + 1], y[1])) & 0x7FFF, m0i)
 44  			& 0x7FFF;
 45  #if BR_ARMEL_CORTEXM_GCC
 46  		if (len4 != 0) {
 47  			uint16_t *limit;
 48  
 49  			limit = d + len4;
 50  			asm volatile (
 51  "\n\
 52  	@ carry: r=r2                                              \n\
 53  	@ multipliers: xu=r3 f=r4                                  \n\
 54  	@ base registers: d+v=r5 y+v=r6 m+v=r7                     \n\
 55  	@ r8 contains 0x7FFF                                       \n\
 56  	@ r9 contains d+len4                                       \n\
 57  	ldr	r0, %[limit]                                       \n\
 58  	ldr	r3, %[xu]                                          \n\
 59  	mov	r9, r0                                             \n\
 60  	ldr	r4, %[f]                                           \n\
 61  	eor	r2, r2                                             \n\
 62  	ldr	r5, %[d]                                           \n\
 63  	sub	r1, r2, #1                                         \n\
 64  	ldr	r6, %[y]                                           \n\
 65  	lsr	r1, r1, #17                                        \n\
 66  	ldr	r7, %[m]                                           \n\
 67  	mov	r8, r1                                             \n\
 68  loop%=:                                                            \n\
 69  	ldrh	r0, [r6, #2]                                       \n\
 70  	ldrh	r1, [r7, #2]                                       \n\
 71  	mul	r0, r3                                             \n\
 72  	mul	r1, r4                                             \n\
 73  	add	r2, r0, r2                                         \n\
 74  	ldrh	r0, [r5, #2]                                       \n\
 75  	add	r2, r1, r2                                         \n\
 76  	mov	r1, r8                                             \n\
 77  	add	r2, r0, r2                                         \n\
 78  	and	r1, r2                                             \n\
 79  	lsr	r2, r2, #15                                        \n\
 80  	strh	r1, [r5, #0]                                       \n\
 81  		                                                   \n\
 82  	ldrh	r0, [r6, #4]                                       \n\
 83  	ldrh	r1, [r7, #4]                                       \n\
 84  	mul	r0, r3                                             \n\
 85  	mul	r1, r4                                             \n\
 86  	add	r2, r0, r2                                         \n\
 87  	ldrh	r0, [r5, #4]                                       \n\
 88  	add	r2, r1, r2                                         \n\
 89  	mov	r1, r8                                             \n\
 90  	add	r2, r0, r2                                         \n\
 91  	and	r1, r2                                             \n\
 92  	lsr	r2, r2, #15                                        \n\
 93  	strh	r1, [r5, #2]                                       \n\
 94  		                                                   \n\
 95  	ldrh	r0, [r6, #6]                                       \n\
 96  	ldrh	r1, [r7, #6]                                       \n\
 97  	mul	r0, r3                                             \n\
 98  	mul	r1, r4                                             \n\
 99  	add	r2, r0, r2                                         \n\
100  	ldrh	r0, [r5, #6]                                       \n\
101  	add	r2, r1, r2                                         \n\
102  	mov	r1, r8                                             \n\
103  	add	r2, r0, r2                                         \n\
104  	and	r1, r2                                             \n\
105  	lsr	r2, r2, #15                                        \n\
106  	strh	r1, [r5, #4]                                       \n\
107  		                                                   \n\
108  	ldrh	r0, [r6, #8]                                       \n\
109  	ldrh	r1, [r7, #8]                                       \n\
110  	mul	r0, r3                                             \n\
111  	mul	r1, r4                                             \n\
112  	add	r2, r0, r2                                         \n\
113  	ldrh	r0, [r5, #8]                                       \n\
114  	add	r2, r1, r2                                         \n\
115  	mov	r1, r8                                             \n\
116  	add	r2, r0, r2                                         \n\
117  	and	r1, r2                                             \n\
118  	lsr	r2, r2, #15                                        \n\
119  	strh	r1, [r5, #6]                                       \n\
120  		                                                   \n\
121  	add	r5, r5, #8                                         \n\
122  	add	r6, r6, #8                                         \n\
123  	add	r7, r7, #8                                         \n\
124  	cmp	r5, r9                                             \n\
125  	bne	loop%=                                             \n\
126  		                                                   \n\
127  	str	r2, %[carry]                                       \n\
128  "
129  : [carry] "=m" (r)
130  : [xu] "m" (xu), [f] "m" (f), [d] "m" (d), [y] "m" (y),
131  	[m] "m" (m), [limit] "m" (limit)
132  : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9" );
133  		} else {
134  			r = 0;
135  		}
136  		v = len4;
137  #else
138  		r = 0;
139  		for (v = 0; v < len4; v += 4) {
140  			uint32_t z;
141  
142  			z = d[v + 1] + MUL15(xu, y[v + 1])
143  				+ MUL15(f, m[v + 1]) + r;
144  			r = z >> 15;
145  			d[v + 0] = z & 0x7FFF;
146  			z = d[v + 2] + MUL15(xu, y[v + 2])
147  				+ MUL15(f, m[v + 2]) + r;
148  			r = z >> 15;
149  			d[v + 1] = z & 0x7FFF;
150  			z = d[v + 3] + MUL15(xu, y[v + 3])
151  				+ MUL15(f, m[v + 3]) + r;
152  			r = z >> 15;
153  			d[v + 2] = z & 0x7FFF;
154  			z = d[v + 4] + MUL15(xu, y[v + 4])
155  				+ MUL15(f, m[v + 4]) + r;
156  			r = z >> 15;
157  			d[v + 3] = z & 0x7FFF;
158  		}
159  #endif
160  		for (; v < len; v ++) {
161  			uint32_t z;
162  
163  			z = d[v + 1] + MUL15(xu, y[v + 1])
164  				+ MUL15(f, m[v + 1]) + r;
165  			r = z >> 15;
166  			d[v + 0] = z & 0x7FFF;
167  		}
168  
169  		zh = dh + r;
170  		d[len] = zh & 0x7FFF;
171  		dh = zh >> 15;
172  	}
173  
174  	/*
175  	 * Restore the bit length (it was overwritten in the loop above).
176  	 */
177  	d[0] = m[0];
178  
179  	/*
180  	 * d[] may be greater than m[], but it is still lower than twice
181  	 * the modulus.
182  	 */
183  	br_i15_sub(d, m, NEQ(dh, 0) | NOT(br_i15_sub(d, m, 0)));
184  }