# rsqrt_test_fn.s
# -----------------------------------------------------------------------------
# Reciprocal-square-root test kernels (GNU as, x86-64, Intel syntax).
#
# Every entry point reads its 32-bit argument from edi and leaves a 32-bit
# result in eax.  The argument is moved into an xmm register and processed
# with scalar single-precision instructions, i.e. it is handled as an
# IEEE-754 binary32 bit pattern.  Presumably the C view is
#     uint32_t f(uint32_t bits);      (System V AMD64 -- TODO confirm)
#
# Each routine is exported twice, with and without a leading underscore.
# _rsqrt_fallback is referenced here but is not defined in this file.
#
# Registers written by the code below: eax, xmm0, xmm1, xmm2, xmm9, flags.
# No routine in this file adjusts rsp.
# -----------------------------------------------------------------------------

        .global _rsqrt_inaccurate
        .global rsqrt_inaccurate
        .global _rsqrt_full
        .global rsqrt_full
        .global _rsqrt_full_gpr
        .global rsqrt_full_gpr
        .global _rsqrt_full_nb
        .global rsqrt_full_nb
        .global _rsqrt_full_nb2
        .global rsqrt_full_nb2
        .global _rsqrt_full_nb_gpr
        .global rsqrt_full_nb_gpr
        .global _rsqrt_newton
        .global rsqrt_newton
        .global _rsqrt_hack
        .global rsqrt_hack
        .global _rsqrt_fallback

        .text
        .intel_syntax noprefix

# -----------------------------------------------------------------------------
# Constants.  Each one is a full 16-byte vector (value in lane 0, zeros in the
# other lanes) because it is used as a memory operand of pand/por/paddd/movaps,
# which need 16-byte-aligned 128-bit operands.
#
# .p2align 4 requests 16-byte alignment on every target; a plain ".align 16"
# is read as 2^16 bytes by assemblers whose .align takes a power of two.
# -----------------------------------------------------------------------------
        .p2align 4
min_pos_denorm:                         # NOTE: 0x00800000 is the smallest positive
        .long   0x00800000,0,0,0        # *normal* binary32, despite the label name
penultimate_bit:
        .long   0x00008000,0,0,0        # bit 15
ultimate_bit:
        .long   0x00004000,0,0,0        # bit 14 (half of a bit-15 step)
top_mask:
        .long   0xFFFF8000,0,0,0        # keeps bits 31..15, clears bits 14..0
one:
        .long   0x3f800000,0,0,0        # binary32 1.0
half:
        .long   0x3f000000,0,0,0        # binary32 0.5
one_point_five:
        .long   0x3fc00000,0,0,0        # binary32 1.5
magic1:                                 # magic1..magic3 feed the correction step of
        .long   0x60000000,0,0,0        # rsqrt_hack; their derivation is not shown in
magic2:                                 # this file -- TODO document
        .long   0x3c000000,0,0,0
magic3:
        .long   0x000047ff,0,0,0

# -----------------------------------------------------------------------------
# rsqrt_inaccurate: raw rsqrtss of the argument, no input or output fix-ups.
# In: edi   Out: eax   Clobbers: xmm0
# -----------------------------------------------------------------------------
_rsqrt_inaccurate:
rsqrt_inaccurate:
        movd    xmm0, edi

        rsqrtss xmm0, xmm0

        movd    eax, xmm0
        ret

# -----------------------------------------------------------------------------
# rsqrt_full: 1.0 / sqrt(x) using sqrtss + divss.
# The input has bits 14..0 cleared and bit 15 forced on; the result has
# 0x4000 added and bits 14..0 cleared (round to a multiple of 0x8000).
# Inputs that are not strictly greater than min_pos_denorm after the input
# fix-up (or that compare unordered) are handed to rsqrt_full_bad.
# In: edi   Out: eax   Clobbers: xmm0, xmm1, flags
# -----------------------------------------------------------------------------
_rsqrt_full:
rsqrt_full:
        movd    xmm0, edi

        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # Lane 0 of xmm1 becomes all-ones when xmm0 <= constant or unordered.
        # Lanes 1..3 are copied from xmm0 and are zero, so ptest sees only
        # the comparison mask.
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        sqrtss  xmm0, xmm0

        movd    xmm1, [rip + one]
        divss   xmm1, xmm0              # xmm1 = 1.0 / sqrt(x)

        paddd   xmm1, [rip + ultimate_bit]
        pand    xmm1, [rip + top_mask]

        movd    eax, xmm1
        ret

# -----------------------------------------------------------------------------
# rsqrt_full_gpr: same computation as rsqrt_full, but the masking and rounding
# are done with integer instructions on eax.  The extra movd at entry and
# exit are kept from the original ("Emulate regalloc mov").
# In: edi   Out: eax (also copied to xmm0)   Clobbers: xmm0, xmm1, flags
# -----------------------------------------------------------------------------
_rsqrt_full_gpr:
rsqrt_full_gpr:
        movd    eax, xmm0               # Emulate regalloc mov

        mov     eax, edi
        and     eax, 0xFFFF8000
        or      eax, 0x00008000

        movd    xmm0, eax
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        sqrtss  xmm0, xmm0

        movd    xmm1, [rip + one]
        divss   xmm1, xmm0
        movd    eax, xmm1

        add     eax, 0x00004000
        and     eax, 0xffff8000

        movd    xmm0, eax               # Emulate regalloc mov
        ret

# -----------------------------------------------------------------------------
# rsqrt_full_nb2: like rsqrt_full_nb, but the range check is a ucomiss/jna
# pair instead of vcmpngt_uqss/ptest.  jna is taken when CF or ZF is set,
# which ucomiss produces for "below", "equal" and "unordered".
# In: edi   Out: eax   Clobbers: xmm0, xmm1, flags
# -----------------------------------------------------------------------------
_rsqrt_full_nb2:
rsqrt_full_nb2:
        movd    xmm0, edi

        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        ucomiss xmm0, [rip + min_pos_denorm]
        jna     rsqrt_full_bad_new1

        sqrtss  xmm0, xmm0

        movd    xmm1, [rip + one]
        divss   xmm1, xmm0

        paddd   xmm1, [rip + ultimate_bit]
        pand    xmm1, [rip + top_mask]

        movd    eax, xmm1
        ret

# -----------------------------------------------------------------------------
# rsqrt_full_nb: same fast path as rsqrt_full, but out-of-range inputs go to
# rsqrt_full_bad_new1 instead of rsqrt_full_bad.
# In: edi   Out: eax   Clobbers: xmm0, xmm1, flags
# -----------------------------------------------------------------------------
_rsqrt_full_nb:
rsqrt_full_nb:
        movd    xmm0, edi

        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new1

        sqrtss  xmm0, xmm0

        movd    xmm1, [rip + one]
        divss   xmm1, xmm0

        paddd   xmm1, [rip + ultimate_bit]
        pand    xmm1, [rip + top_mask]

        movd    eax, xmm1
        ret

# Slow path shared by rsqrt_full_nb and rsqrt_full_nb2.  Works from the
# unmodified argument in edi.
rsqrt_full_bad_new1:
        # Unsigned compare: taken for 0x00000000..0x007FFFFF only.
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback1

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        ucomiss xmm1, xmm1              # PF set <=> rsqrtss result is NaN
        jp      rsqrt_full_bad_new1_nan

        movd    eax, xmm1
        ret

rsqrt_full_bad_new_fallback1:
        # Tail call.  rsp is never adjusted in this file, so a `call` here would
        # be issued 8 bytes off the 16-byte stack alignment that the SysV AMD64
        # ABI requires at call sites.  edi still holds the original argument and
        # the callee's eax goes straight back to our caller.
        jmp     _rsqrt_fallback

rsqrt_full_bad_new1_nan:
        ucomiss xmm0, xmm0              # PF set <=> the input itself is NaN
        jp      rsqrt_full_bad_new1_nan_ret

        # NaN result from a non-NaN input: return a fixed quiet NaN pattern.
        mov     eax, 0x7FC00000
        ret

rsqrt_full_bad_new1_nan_ret:
        # Input was NaN.  xmm1 holds the rsqrtss result, which is known to be a
        # NaN on this path; return it.  (Previously this path returned without
        # writing eax at all.)
        movd    eax, xmm1
        ret

# -----------------------------------------------------------------------------
# rsqrt_full_nb_gpr: same fast path as rsqrt_full_gpr, but out-of-range inputs
# go to rsqrt_full_bad_new2.
# In: edi   Out: eax (also copied to xmm0 on the fast path)
# Clobbers: xmm0, xmm1, flags
# -----------------------------------------------------------------------------
_rsqrt_full_nb_gpr:
rsqrt_full_nb_gpr:
        movd    eax, xmm0               # Emulate regalloc mov

        mov     eax, edi
        and     eax, 0xFFFF8000
        or      eax, 0x00008000

        movd    xmm0, eax
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad_new2

        sqrtss  xmm0, xmm0

        movd    xmm1, [rip + one]
        divss   xmm1, xmm0
        movd    eax, xmm1

        add     eax, 0x00004000
        and     eax, 0xffff8000

        movd    xmm0, eax               # Emulate regalloc mov
        ret

# Slow path for rsqrt_full_nb_gpr.  Works from the unmodified argument in edi.
rsqrt_full_bad_new2:
        # Unsigned compare: taken for 0x00000000..0x007FFFFF only.
        cmp     edi, 0x00800000
        jb      rsqrt_full_bad_new_fallback2

        movd    xmm0, edi
        rsqrtss xmm1, xmm0

        # NOTE(review): every input with the sign bit set -- including -0.0 and
        # negative-signed NaNs -- is mapped to 0x7FC00000 here, whereas
        # rsqrt_full_bad returns edi | 0x7F800000 for zeros.  Confirm intended.
        test    edi, edi
        js      rsqrt_full_bad_new2_nan

        movd    eax, xmm1
        ret

rsqrt_full_bad_new_fallback2:
        # Tail call; see rsqrt_full_bad_new_fallback1 for why this is not a
        # `call` followed by `ret`.
        jmp     _rsqrt_fallback

rsqrt_full_bad_new2_nan:
        mov     eax, 0x7FC00000
        ret

# -----------------------------------------------------------------------------
# rsqrt_full_bad: slow path shared by rsqrt_full, rsqrt_full_gpr, rsqrt_newton
# and rsqrt_hack.  Classifies the unmodified argument in edi against 0.0:
#   unordered (NaN)  -> edi | 0x00400000
#   equal to zero    -> edi | 0x7F800000
#   below zero       -> 0x7FC00000
#   edi == 0x7F800000-> 0
#   anything else    -> _rsqrt_fallback
# -----------------------------------------------------------------------------
rsqrt_full_bad:
        xorps   xmm1, xmm1
        movd    xmm0, edi
        ucomiss xmm0, xmm1
        jp      rsqrt_full_nan          # must be tested first: unordered also
        je      rsqrt_full_zero         # sets ZF and CF
        jc      rsqrt_full_neg

        cmp     edi, 0x7F800000
        je      rsqrt_full_inf

        # TODO: Full Denormal Implementation
        # Tail call: rsp has not been adjusted, so a `call` here would be made
        # 8 bytes off the 16-byte alignment required at SysV AMD64 call sites.
        jmp     _rsqrt_fallback

rsqrt_full_neg:
        mov     eax, 0x7FC00000
        ret

rsqrt_full_inf:
        xor     eax, eax
        ret

rsqrt_full_nan:
        mov     eax, edi
        or      eax, 0x00400000         # set mantissa bit 22 of the input NaN
        ret

rsqrt_full_zero:
        mov     eax, edi
        or      eax, 0x7F800000         # all exponent bits on, sign bit kept
        ret

# -----------------------------------------------------------------------------
# rsqrt_newton: rsqrtps estimate y refined once as y * (1.5 - 0.5 * x * y * y),
# with the same input fix-up, range check and output rounding as rsqrt_full.
# In: edi   Out: eax   Clobbers: xmm0, xmm1, xmm2, flags
# -----------------------------------------------------------------------------
_rsqrt_newton:
rsqrt_newton:
        movd    xmm0, edi

        pand    xmm0, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        rsqrtps xmm1, xmm0              # xmm1 = y (estimate; lane 0 is used)
        mulss   xmm0, [rip + half]      # xmm0 = 0.5 * x
        vmulss  xmm2, xmm1, xmm1        # xmm2 = y * y
        mulss   xmm2, xmm0              # xmm2 = 0.5 * x * y * y
        movaps  xmm0, [rip + one_point_five]
        subss   xmm0, xmm2              # xmm0 = 1.5 - 0.5 * x * y * y
        mulss   xmm0, xmm1              # xmm0 = y * (1.5 - 0.5 * x * y * y)

        paddd   xmm0, [rip + ultimate_bit]
        pand    xmm0, [rip + top_mask]

        movd    eax, xmm0
        ret

# -----------------------------------------------------------------------------
# rsqrt_hack: rsqrtps estimate adjusted by an integer correction term that is
# computed from the raw argument bits (kept in xmm9) and magic1..magic3.
# In: edi   Out: eax   Clobbers: xmm0, xmm1, xmm2, xmm9, flags
# -----------------------------------------------------------------------------
_rsqrt_hack:
rsqrt_hack:
        movd    xmm9, edi

        vpand   xmm0, xmm9, [rip + top_mask]
        por     xmm0, [rip + penultimate_bit]

        # detect NaNs, negatives, zeros, denormals and infinities
        vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
        ptest   xmm1, xmm1
        jnz     rsqrt_full_bad

        # calculate x64 estimate
        rsqrtps xmm0, xmm0

        # calculate correction factor
        vpslld  xmm1, xmm9, 8           # xmm1 = raw bits << 8
        vpsrad  xmm2, xmm1, 31          # xmm2 = top bit of xmm1, replicated
        paddd   xmm1, [rip + magic1]
        pcmpgtd xmm1, [rip + magic2]    # signed compare -> 0 or -1
        pxor    xmm1, xmm2
        movaps  xmm2, [rip + magic3]
        psubd   xmm2, xmm1              # xmm2 = magic3 - (0 or -1)

        # correct x64 estimate
        paddd   xmm0, xmm2
        pand    xmm0, [rip + top_mask]

        movd    eax, xmm0
        ret