/ tests / rsqrt_test_fn.s
rsqrt_test_fn.s
  1  .global _rsqrt_inaccurate
  2  .global rsqrt_inaccurate
  3  .global _rsqrt_full
  4  .global rsqrt_full
  5  .global _rsqrt_full_gpr
  6  .global rsqrt_full_gpr
  7  .global _rsqrt_full_nb
  8  .global rsqrt_full_nb
  9  .global _rsqrt_full_nb2
 10  .global rsqrt_full_nb2
 11  .global _rsqrt_full_nb_gpr
 12  .global rsqrt_full_nb_gpr
 13  .global _rsqrt_newton
 14  .global rsqrt_newton
 15  .global _rsqrt_hack
 16  .global rsqrt_hack
 17  .global _rsqrt_fallback
 18  
 19  .text
 20  .intel_syntax noprefix
 21  
 22  .align 16
 23  min_pos_denorm:
 24  .long 0x00800000,0,0,0
 25  penultimate_bit:
 26  .long 0x00008000,0,0,0
 27  ultimate_bit:
 28  .long 0x00004000,0,0,0
 29  top_mask:
 30  .long 0xFFFF8000,0,0,0
 31  one:
 32  .long 0x3f800000,0,0,0
 33  half:
 34  .long 0x3f000000,0,0,0
 35  one_point_five:
 36  .long 0x3fc00000,0,0,0
 37  magic1:
 38  .long 0x60000000,0,0,0
 39  magic2:
 40  .long 0x3c000000,0,0,0
 41  magic3:
 42  .long 0x000047ff,0,0,0
 43  
 44  _rsqrt_inaccurate:
 45  rsqrt_inaccurate:
 46      movd xmm0, edi
 47  
 48      rsqrtss xmm0, xmm0
 49  
 50      movd eax, xmm0
 51      ret
 52  
 53  _rsqrt_full:
 54  rsqrt_full:
 55      movd xmm0, edi
 56  
 57      pand xmm0, [rip + top_mask]
 58      por xmm0, [rip + penultimate_bit]
 59  
 60      vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
 61      ptest xmm1, xmm1
 62      jnz rsqrt_full_bad
 63  
 64      sqrtss xmm0, xmm0
 65  
 66      movd xmm1, [rip + one]
 67      divss xmm1, xmm0
 68  
 69      paddd xmm1, [rip + ultimate_bit]
 70      pand xmm1, [rip + top_mask]
 71  
 72      movd eax, xmm1
 73      ret
 74  
 75  _rsqrt_full_gpr:
 76  rsqrt_full_gpr:
 77      movd eax, xmm0 # Emulate regalloc mov
 78  
 79      mov eax, edi
 80      and eax, 0xFFFF8000
 81      or eax, 0x00008000
 82  
 83      movd xmm0, eax
 84      vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
 85      ptest xmm1, xmm1
 86      jnz rsqrt_full_bad
 87  
 88      sqrtss xmm0, xmm0
 89  
 90      movd xmm1, [rip + one]
 91      divss xmm1, xmm0
 92      movd eax, xmm1
 93  
 94      add eax, 0x00004000
 95      and eax, 0xffff8000
 96  
 97      movd xmm0, eax # Emulate regalloc mov
 98      ret
 99  
# rsqrt_full_nb2: like rsqrt_full_nb, but the range check is a ucomiss/jna
# pair instead of vcmpngt_uqss/ptest.
#   in:       edi = float bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, flags
# Inputs that fail the range check are handed to rsqrt_full_bad_new1.
rsqrt_full_nb2:
_rsqrt_full_nb2:
        movd    xmm0, edi

        # Drop input bits 14..0, then force bit 15 on.
        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # jna is taken for below, equal and unordered, so anything that is
        # not strictly greater than 0x00800000 (NaN included) leaves here.
        ucomiss xmm0, [rip + min_pos_denorm]
        jna     rsqrt_full_bad_new1

        # xmm1 = 1.0 / sqrt(xmm0)
        movd    xmm1, [rip + one]
        sqrtss  xmm0, xmm0
        divss   xmm1, xmm0

        # Integer add of bit 14, then clear bits 14..0.
        paddd   xmm1, [rip + ultimate_bit]
        pand    xmm1, [rip + top_mask]

        movd    eax, xmm1
        ret
120  
# rsqrt_full_nb: same fast path as rsqrt_full, but out-of-range inputs go to
# rsqrt_full_bad_new1 instead of rsqrt_full_bad.
#   in:       edi = float bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, flags
rsqrt_full_nb:
_rsqrt_full_nb:
        movd    xmm0, edi

        # Drop input bits 14..0, then force bit 15 on.
        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # Low lane of xmm1 is all-ones when the adjusted value is not greater
        # than 0x00800000 or is NaN.
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new1

        # xmm1 = 1.0 / sqrt(xmm0)
        movd    xmm1, [rip + one]
        sqrtss  xmm0, xmm0
        divss   xmm1, xmm0

        # Integer add of bit 14, then clear bits 14..0.
        paddd   xmm1, [rip + ultimate_bit]
        pand    xmm1, [rip + top_mask]

        movd    eax, xmm1
        ret
142  
# rsqrt_full_bad_new1: slow path shared by rsqrt_full_nb and rsqrt_full_nb2.
#   in:       edi = original (unmasked) input bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Fixes relative to the previous version:
#   * A NaN input used to reach a bare `ret` with eax never written; it now
#     returns the input with the quiet bit set, as rsqrt_full_nan does.
#   * _rsqrt_fallback was reached with `call` while rsp was 8 (mod 16).  It is
#     now a tail call, so the callee sees the same stack layout as if it had
#     been called directly by our caller.
rsqrt_full_bad_new1:
        # Unsigned compare: only +0 and positive denormals are below
        # 0x00800000; anything with the sign bit set compares above.
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback1

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        ucomiss xmm1, xmm1
        jp      rsqrt_full_bad_new1_nan     # estimate is NaN

        movd    eax, xmm1                   # otherwise the estimate is the answer
        ret

rsqrt_full_bad_new_fallback1:
        jmp     _rsqrt_fallback             # tail call: edi untouched, result comes back in eax

rsqrt_full_bad_new1_nan:
        ucomiss xmm0, xmm0
        jp      rsqrt_full_bad_new1_nan_ret # the input itself was NaN

        mov     eax, 0x7FC00000             # non-NaN input produced NaN: return the default quiet NaN
        ret

rsqrt_full_bad_new1_nan_ret:
        mov     eax, edi                    # propagate the input NaN ...
        or      eax, 0x00400000             # ... with its quiet bit set
        ret
169  
# rsqrt_full_nb_gpr: same fast path as rsqrt_full_gpr (masking and rounding
# done in eax), but out-of-range inputs go to rsqrt_full_bad_new2.
#   in:       edi = float bit pattern
#   out:      eax = result bit pattern (also copied into xmm0 on the fast path)
#   clobbers: xmm0, xmm1, flags
rsqrt_full_nb_gpr:
_rsqrt_full_nb_gpr:
        movd    eax, xmm0                   # Emulate regalloc mov (value is overwritten below)

        # eax = (input with bits 14..0 dropped) with bit 15 forced on
        mov     eax, edi
        and     eax, 0xFFFF8000
        or      eax, 0x00008000
        movd    xmm0, eax

        # Slow path when the adjusted value is not greater than 0x00800000
        # or is NaN.
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new2

        # eax = bits of 1.0 / sqrt(xmm0)
        movd    xmm1, [rip + one]
        sqrtss  xmm0, xmm0
        divss   xmm1, xmm0
        movd    eax, xmm1

        # Integer add of bit 14, then clear bits 14..0.
        add     eax, 0x00004000
        and     eax, 0xffff8000

        movd    xmm0, eax                   # Emulate regalloc mov
        ret
194  
# rsqrt_full_bad_new2: slow path for rsqrt_full_nb_gpr.
#   in:       edi = original (unmasked) input bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Fixes relative to the previous version:
#   * The sign test (`test edi, edi / js`) also caught -0.0 and returned NaN
#     for it, while rsqrt_full_bad and rsqrt_full_bad_new1 both return -inf.
#     The check is now "strictly above 0x80000000" (unsigned), so -0.0 falls
#     through to the rsqrtss result like the other non-negative inputs.
#   * _rsqrt_fallback was reached with `call` while rsp was 8 (mod 16); it is
#     now a tail call.
rsqrt_full_bad_new2:
        # Unsigned compare: only +0 and positive denormals are below
        # 0x00800000.
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback2

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        # Sign bit set and at least one other bit set: negative non-zero
        # value, -inf, or a NaN with the sign bit set.
        cmp     edi, 0x80000000
        ja      rsqrt_full_bad_new2_nan

        movd    eax, xmm1                   # otherwise return the rsqrtss result
        ret

rsqrt_full_bad_new_fallback2:
        jmp     _rsqrt_fallback             # tail call: edi untouched, result comes back in eax

rsqrt_full_bad_new2_nan:
        mov     eax, 0x7FC00000             # default quiet NaN
        ret
215  
# rsqrt_full_bad: slow path for rsqrt_full, rsqrt_full_gpr, rsqrt_newton and
# rsqrt_hack.  Classifies the original input and returns the special-case
# result, or defers to _rsqrt_fallback for whatever is left.
#   in:       edi = original (unmasked) input bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
#
# Fix relative to the previous version: _rsqrt_fallback was reached with
# `call` while rsp was 8 (mod 16).  It is now a tail call, so the callee sees
# the same stack layout as if our caller had called it directly.
rsqrt_full_bad:
        xorps   xmm1, xmm1                  # xmm1 = +0.0
        movd    xmm0, edi
        ucomiss xmm0, xmm1
        jp      rsqrt_full_nan              # unordered: input is NaN (must be tested before je)
        je      rsqrt_full_zero             # equal to zero: +0.0 or -0.0
        jc      rsqrt_full_neg              # below zero

        cmp     edi, 0x7F800000
        je      rsqrt_full_inf              # +infinity

        # TODO: Full Denormal Implementation
        jmp     _rsqrt_fallback             # tail call: edi untouched, result comes back in eax

rsqrt_full_neg:
        mov     eax, 0x7FC00000             # negative input: default quiet NaN
        ret

rsqrt_full_inf:
        xor     eax, eax                    # +infinity: +0.0
        ret

rsqrt_full_nan:
        mov     eax, edi                    # propagate the input NaN ...
        or      eax, 0x00400000             # ... with its quiet bit set
        ret

rsqrt_full_zero:
        mov     eax, edi                    # keep the sign of the zero ...
        or      eax, 0x7F800000             # ... and return an infinity of that sign
        ret
248  
# rsqrt_newton: hardware estimate refined by one Newton-Raphson step,
#   y' = y * (1.5 - (0.5 * x) * y * y),
# followed by the same bit-15 rounding as rsqrt_full.
#   in:       edi = float bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, xmm2, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
rsqrt_newton:
_rsqrt_newton:
        movd    xmm0, edi

        # Drop input bits 14..0, then force bit 15 on.
        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # Slow path when the adjusted value is not greater than 0x00800000
        # or is NaN.
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        rsqrtps xmm1, xmm0                  # xmm1 = y  (estimate; only lane 0 is used)
        vmulss  xmm2, xmm1, xmm1            # xmm2 = y * y
        mulss   xmm0, [rip + half]          # xmm0 = 0.5 * x
        mulss   xmm2, xmm0                  # xmm2 = 0.5 * x * y * y
        movaps  xmm0, [rip + one_point_five]
        subss   xmm0, xmm2                  # xmm0 = 1.5 - 0.5 * x * y * y
        mulss   xmm0, xmm1                  # xmm0 = y'

        # Integer add of bit 14, then clear bits 14..0.
        paddd   xmm0, [rip + ultimate_bit]
        pand    xmm0, [rip + top_mask]

        movd    eax, xmm0
        ret
273  
# rsqrt_hack: hardware estimate plus an integer bias chosen from the input
# bits, then truncated at bit 15.
#   in:       edi = float bit pattern
#   out:      eax = result bit pattern
#   clobbers: xmm0, xmm1, xmm2, xmm9, flags
# Inputs that fail the range check are handed to rsqrt_full_bad.
rsqrt_hack:
_rsqrt_hack:
        movd    xmm9, edi                   # xmm9 keeps the untouched input bits

        # xmm0 = input with bits 14..0 dropped and bit 15 forced on
        vpand   xmm0, xmm9, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # detect NaNs, negatives, zeros, denormals and infinities
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        # hardware estimate (only lane 0 is used)
        rsqrtps xmm0, xmm0

        # calculate correction factor
        vpslld  xmm1, xmm9, 8               # input bit 23 -> bit 31, bits 22..0 -> bits 30..8
        vpsrad  xmm2, xmm1, 31              # xmm2 = all-ones if input bit 23 is set, else 0
        paddd   xmm1, [rip + magic1]
        pcmpgtd xmm1, [rip + magic2]        # signed compare -> all-ones or 0
        pxor    xmm1, xmm2                  # flip the compare result when input bit 23 is set
        movaps  xmm2, [rip + magic3]
        psubd   xmm2, xmm1                  # bias = magic3 - (0 or -1), i.e. magic3 or magic3 + 1

        # correct the estimate: add the bias, then clear bits 14..0
        paddd   xmm0, xmm2
        pand    xmm0, [rip + top_mask]

        movd    eax, xmm0
        ret