InstEmitSimdArithmetic32.cs
using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using static ARMeilleure.Instructions.InstEmitFlowHelper;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper32;
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;

namespace ARMeilleure.Instructions
{
    static partial class InstEmit32
    {
        public static void Vabd_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorBinaryOpI32(context, (op1, op2) => EmitAbs(context, context.Subtract(op1, op2)), !op.U);
        }

        public static void Vabdl_I(ArmEmitterContext context)
        {
            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;

            EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitAbs(context, context.Subtract(op1, op2)), !op.U);
        }

        public static void Vabs_S(ArmEmitterContext context)
        {
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FabsS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarUnaryOpSimd32(context, (m) =>
                {
                    return EmitFloatAbs(context, m, (op.Size & 1) == 0, false);
                });
            }
            else
            {
                EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1));
            }
        }

        public static void Vabs_V(ArmEmitterContext context)
        {
            OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

            if (op.F)
            {
                if (Optimizations.FastFP && Optimizations.UseAdvSimd)
                {
                    InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FabsV);
                }
                else if (Optimizations.FastFP && Optimizations.UseSse2)
                {
                    EmitVectorUnaryOpSimd32(context, (m) =>
                    {
                        return EmitFloatAbs(context, m, (op.Size & 1) == 0, true);
                    });
                }
                else
                {
                    EmitVectorUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, nameof(Math.Abs), op1));
                }
            }
            else
            {
                EmitVectorUnaryOpSx32(context, (op1) => EmitAbs(context, op1));
            }
        }

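        // Branchless integer absolute value: compare against zero, then select between
        // the value and its negation. The most negative value wraps to itself, matching
        // the non-saturating behaviour of the ARM absolute value/difference ops above.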
        private static Operand EmitAbs(ArmEmitterContext context, Operand value)
        {
            Operand isPositive = context.ICompareGreaterOrEqual(value, Const(value.Type, 0));

            return context.ConditionalSelect(isPositive, value, context.Negate(value));
        }

        public static void Vadd_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarBinaryOpF32(context, Intrinsic.X86Addss, Intrinsic.X86Addsd);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => context.Add(op1, op2));
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2));
            }
        }

        public static void Vadd_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FaddV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorBinaryOpF32(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
            }
            else if (Optimizations.FastFP)
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => context.Add(op1, op2));
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2));
            }
        }

        public static void Vadd_I(ArmEmitterContext context)
        {
            if (Optimizations.UseSse2)
            {
                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PaddInstruction[op.Size], op1, op2));
            }
            else
            {
                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Add(op1, op2));
            }
        }

        public static void Vaddl_I(ArmEmitterContext context)
        {
            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;

            EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
        }

        public static void Vaddw_I(ArmEmitterContext context)
        {
            OpCode32SimdRegWide op = (OpCode32SimdRegWide)context.CurrOp;

            EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
        }

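        // Per-byte population count. Uses the x86 POPCNT instruction on each extracted
        // element when available, otherwise a bit-twiddling fallback (EmitCountSetBits8).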
        public static void Vcnt(ArmEmitterContext context)
        {
            OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

            Operand res = GetVecA32(op.Qd);

            int elems = op.GetBytesCount();

            for (int index = 0; index < elems; index++)
            {
                Operand de;
                Operand me = EmitVectorExtractZx32(context, op.Qm, op.Im + index, op.Size);

                if (Optimizations.UsePopCnt)
                {
                    de = context.AddIntrinsicInt(Intrinsic.X86Popcnt, me);
                }
                else
                {
                    de = EmitCountSetBits8(context, me);
                }

                res = EmitVectorInsert(context, res, de, op.Id + index, op.Size);
            }

            context.Copy(GetVecA32(op.Qd), res);
        }

        public static void Vdup(ArmEmitterContext context)
        {
            OpCode32SimdDupGP op = (OpCode32SimdDupGP)context.CurrOp;

            Operand insert = GetIntA32(context, op.Rt);

            // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
            insert = op.Size switch
            {
                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\"."),
            };

            InsertScalar(context, op.Vd, insert);
            if (op.Q)
            {
                InsertScalar(context, op.Vd + 1, insert);
            }
        }

        public static void Vdup_1(ArmEmitterContext context)
        {
            OpCode32SimdDupElem op = (OpCode32SimdDupElem)context.CurrOp;

            Operand insert = EmitVectorExtractZx32(context, op.Vm >> 1, ((op.Vm & 1) << (3 - op.Size)) + op.Index, op.Size);

            // Zero extend into an I64, then replicate. Saves the most time over elementwise inserts.
            insert = op.Size switch
            {
                2 => context.Multiply(context.ZeroExtend32(OperandType.I64, insert), Const(0x0000000100000001u)),
                1 => context.Multiply(context.ZeroExtend16(OperandType.I64, insert), Const(0x0001000100010001u)),
                0 => context.Multiply(context.ZeroExtend8(OperandType.I64, insert), Const(0x0101010101010101u)),
                _ => throw new InvalidOperationException($"Invalid Vdup size \"{op.Size}\"."),
            };

            InsertScalar(context, op.Vd, insert);
            if (op.Q)
            {
                InsertScalar(context, op.Vd | 1, insert);
            }
        }

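        // Builds the two 64-bit halves of a PSHUFB control mask that places `length`
        // consecutive source bytes (starting at `startByte`) at offset `start` of the
        // destination; all other lanes are set to 0x80, which PSHUFB zeroes.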
        private static (long, long) MaskHelperByteSequence(int start, int length, int startByte)
        {
            int end = start + length;
            int b = startByte;
            long result = 0;
            long result2 = 0;
            for (int i = 0; i < 8; i++)
            {
                result |= (long)((i >= end || i < start) ? 0x80 : b++) << (i * 8);
            }
            for (int i = 8; i < 16; i++)
            {
                result2 |= (long)((i >= end || i < start) ? 0x80 : b++) << ((i - 8) * 8);
            }
            return (result2, result);
        }

        public static void Vext(ArmEmitterContext context)
        {
            OpCode32SimdExt op = (OpCode32SimdExt)context.CurrOp;
            int elems = op.GetBytesCount();
            int byteOff = op.Immediate;

            if (Optimizations.UseSsse3)
            {
                EmitVectorBinaryOpSimd32(context, (n, m) =>
                {
                    // Writing low to high of d: start <imm> into n, overlap into m.
                    // Then rotate n down by <imm>, m up by (elems)-imm.
                    // Then OR them together for the result.

                    (long nMaskHigh, long nMaskLow) = MaskHelperByteSequence(0, elems - byteOff, byteOff);
                    (long mMaskHigh, long mMaskLow) = MaskHelperByteSequence(elems - byteOff, byteOff, 0);
                    Operand nMask, mMask;
                    if (!op.Q)
                    {
                        // Do the same operation to the bytes in the top doubleword too, as our target could be in either.
                        nMaskHigh = nMaskLow + 0x0808080808080808L;
                        mMaskHigh = mMaskLow + 0x0808080808080808L;
                    }
                    nMask = X86GetElements(context, nMaskHigh, nMaskLow);
                    mMask = X86GetElements(context, mMaskHigh, mMaskLow);
                    Operand nPart = context.AddIntrinsic(Intrinsic.X86Pshufb, n, nMask);
                    Operand mPart = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mMask);

                    return context.AddIntrinsic(Intrinsic.X86Por, nPart, mPart);
                });
            }
            else
            {
                Operand res = GetVecA32(op.Qd);

                for (int index = 0; index < elems; index++)
                {
                    Operand extract;

                    if (byteOff >= elems)
                    {
                        extract = EmitVectorExtractZx32(context, op.Qm, op.Im + (byteOff - elems), op.Size);
                    }
                    else
                    {
                        extract = EmitVectorExtractZx32(context, op.Qn, op.In + byteOff, op.Size);
                    }
                    byteOff++;

                    res = EmitVectorInsert(context, res, extract, op.Id + index, op.Size);
                }

                context.Copy(GetVecA32(op.Qd), res);
            }
        }

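        // The x86 "231" FMA forms used by the fused ops below accumulate into the
        // destination operand (e.g. VFMADD231 computes d = d + n * m), which lines up
        // with the accumulate-into-d semantics of the VFMA instruction family.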
        public static void Vfma_S(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmadd231ss, Intrinsic.X86Vfmadd231sd);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
                });
            }
        }

        public static void Vfma_V(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlaV);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfmadd231ps);
            }
            else
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
                });
            }
        }

        public static void Vfms_S(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmadd231ss, Intrinsic.X86Vfnmadd231sd);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3);
                });
            }
        }

        public static void Vfms_V(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitVectorTernaryOpF32(context, Intrinsic.X86Vfnmadd231ps);
            }
            else
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
                });
            }
        }

        public static void Vfnma_S(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfnmsub231ss, Intrinsic.X86Vfnmsub231sd);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
                });
            }
        }

        public static void Vfnms_S(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseFma)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
                });
            }
        }

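        // Halving add: (op1 + op2) >> 1, using an arithmetic shift for the signed form.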
        public static void Vhadd(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (op.U)
            {
                EmitVectorBinaryOpZx32(context, (op1, op2) => context.ShiftRightUI(context.Add(op1, op2), Const(1)));
            }
            else
            {
                EmitVectorBinaryOpSx32(context, (op1, op2) => context.ShiftRightSI(context.Add(op1, op2), Const(1)));
            }
        }

        public static void Vmov_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarUnaryOpF32(context, 0, 0);
            }
            else
            {
                EmitScalarUnaryOpF32(context, (op1) => op1);
            }
        }

        public static void Vmovn(ArmEmitterContext context)
        {
            EmitVectorUnaryNarrowOp32(context, (op1) => op1);
        }

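        // Floating-point negation just flips the sign bit, so the SSE2 paths below XOR
        // the operand with -0.0 (sign bit only) instead of doing an arithmetic negate.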
        public static void Vneg_S(ArmEmitterContext context)
        {
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FnegS);
            }
            else if (Optimizations.UseSse2)
            {
                EmitScalarUnaryOpSimd32(context, (m) =>
                {
                    if ((op.Size & 1) == 0)
                    {
                        Operand mask = X86GetScalar(context, -0f);
                        return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
                    }
                    else
                    {
                        Operand mask = X86GetScalar(context, -0d);
                        return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
                    }
                });
            }
            else
            {
                EmitScalarUnaryOpF32(context, (op1) => context.Negate(op1));
            }
        }

        public static void Vnmul_S(ArmEmitterContext context)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FnmulS);
            }
            else if (Optimizations.UseSse2)
            {
                EmitScalarBinaryOpSimd32(context, (n, m) =>
                {
                    if ((op.Size & 1) == 0)
                    {
                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        Operand mask = X86GetScalar(context, -0f);
                        return context.AddIntrinsic(Intrinsic.X86Xorps, mask, res);
                    }
                    else
                    {
                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        Operand mask = X86GetScalar(context, -0d);
                        return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, res);
                    }
                });
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => context.Negate(context.Multiply(op1, op2)));
            }
        }

        public static void Vnmla_S(ArmEmitterContext context)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd, isNegD: true);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return context.Subtract(context.Negate(op1), context.Multiply(op2, op3));
                });
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), context.Negate(op1), res);
                });
            }
        }

        public static void Vnmls_S(ArmEmitterContext context)
        {
            OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FnmsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd, isNegD: true);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return context.Add(context.Negate(op1), context.Multiply(op2, op3));
                });
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), context.Negate(op1), res);
                });
            }
        }

        public static void Vneg_V(ArmEmitterContext context)
        {
            OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;

            if (op.F)
            {
                if (Optimizations.FastFP && Optimizations.UseAdvSimd)
                {
                    InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FnegV);
                }
                else if (Optimizations.FastFP && Optimizations.UseSse2)
                {
                    EmitVectorUnaryOpSimd32(context, (m) =>
                    {
                        if ((op.Size & 1) == 0)
                        {
                            Operand mask = X86GetAllElements(context, -0f);
                            return context.AddIntrinsic(Intrinsic.X86Xorps, mask, m);
                        }
                        else
                        {
                            Operand mask = X86GetAllElements(context, -0d);
                            return context.AddIntrinsic(Intrinsic.X86Xorpd, mask, m);
                        }
                    });
                }
                else
                {
                    EmitVectorUnaryOpF32(context, (op1) => context.Negate(op1));
                }
            }
            else
            {
                EmitVectorUnaryOpSx32(context, (op1) => context.Negate(op1));
            }
        }

        public static void Vdiv_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FdivS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarBinaryOpF32(context, Intrinsic.X86Divss, Intrinsic.X86Divsd);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => context.Divide(op1, op2));
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2);
                });
            }
        }

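        // VMAXNM/VMINNM return the numerical operand when exactly one input is a quiet
        // NaN; the SSE4.1 implementation of that rule lives in EmitSse41MaxMinNumOpF32
        // at the bottom of this file.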
        public static void Vmaxnm_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FmaxnmS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                EmitSse41MaxMinNumOpF32(context, true, true);
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2));
            }
        }

        public static void Vmaxnm_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmaxnmV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                EmitSse41MaxMinNumOpF32(context, true, false);
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxNumFpscr), op1, op2));
            }
        }

        public static void Vminnm_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FminnmS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                EmitSse41MaxMinNumOpF32(context, false, true);
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2));
            }
        }

        public static void Vminnm_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FminnmV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                EmitSse41MaxMinNumOpF32(context, false, false);
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinNumFpscr), op1, op2));
            }
        }

        public static void Vmax_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmaxV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorBinaryOpF32(context, Intrinsic.X86Maxps, Intrinsic.X86Maxpd);
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxFpscr), op1, op2);
                });
            }
        }

        public static void Vmax_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (op.U)
            {
                if (Optimizations.UseSse2)
                {
                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxuInstruction[op.Size], op1, op2));
                }
                else
                {
                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreaterUI(op1, op2), op1, op2));
                }
            }
            else
            {
                if (Optimizations.UseSse2)
                {
                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PmaxsInstruction[op.Size], op1, op2));
                }
                else
                {
                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareGreater(op1, op2), op1, op2));
                }
            }
        }

        public static void Vmin_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FminV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorBinaryOpF32(context, Intrinsic.X86Minps, Intrinsic.X86Minpd);
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), op1, op2);
                });
            }
        }

        public static void Vmin_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (op.U)
            {
                if (Optimizations.UseSse2)
                {
                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminuInstruction[op.Size], op1, op2));
                }
                else
                {
                    EmitVectorBinaryOpZx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLessUI(op1, op2), op1, op2));
                }
            }
            else
            {
                if (Optimizations.UseSse2)
                {
                    EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PminsInstruction[op.Size], op1, op2));
                }
                else
                {
                    EmitVectorBinaryOpSx32(context, (op1, op2) => context.ConditionalSelect(context.ICompareLess(op1, op2), op1, op2));
                }
            }
        }

        public static void Vmla_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Addss, Intrinsic.X86Addsd);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return context.Add(op1, context.Multiply(op2, op3));
                });
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, res);
                });
            }
        }

        public static void Vmla_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlaV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
            }
            else if (Optimizations.FastFP)
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
            }
            else
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3);
                });
            }
        }

        public static void Vmla_I(ArmEmitterContext context)
        {
            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
        }

        public static void Vmla_1(ArmEmitterContext context)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            if (op.F)
            {
                if (Optimizations.FastFP && Optimizations.UseSse2)
                {
                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Addps, Intrinsic.X86Addpd);
                }
                else if (Optimizations.FastFP)
                {
                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)));
                }
                else
                {
                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulAddFpscr), op1, op2, op3));
                }
            }
            else
            {
                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Add(op1, context.Multiply(op2, op3)), false);
            }
        }

        public static void Vmlal_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorTernaryLongOpI32(context, (d, n, m) => context.Add(d, context.Multiply(n, m)), !op.U);
        }

        public static void Vmls_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarTernaryOpF32(context, Intrinsic.Arm64FmlsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarTernaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd, Intrinsic.X86Subss, Intrinsic.X86Subsd);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return context.Subtract(op1, context.Multiply(op2, op3));
                });
            }
            else
            {
                EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
                {
                    Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op2, op3);
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, res);
                });
            }
        }

        public static void Vmls_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorTernaryOpF32(context, Intrinsic.Arm64FmlsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorTernaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
            }
            else if (Optimizations.FastFP)
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
            }
            else
            {
                EmitVectorTernaryOpF32(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3);
                });
            }
        }

        public static void Vmls_I(ArmEmitterContext context)
        {
            EmitVectorTernaryOpZx32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
        }

        public static void Vmls_1(ArmEmitterContext context)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            if (op.F)
            {
                if (Optimizations.FastFP && Optimizations.UseSse2)
                {
                    EmitVectorsByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd, Intrinsic.X86Subps, Intrinsic.X86Subpd);
                }
                else if (Optimizations.FastFP)
                {
                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)));
                }
                else
                {
                    EmitVectorsByScalarOpF32(context, (op1, op2, op3) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulSubFpscr), op1, op2, op3));
                }
            }
            else
            {
                EmitVectorsByScalarOpI32(context, (op1, op2, op3) => context.Subtract(op1, context.Multiply(op2, op3)), false);
            }
        }

        public static void Vmlsl_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorTernaryLongOpI32(context, (opD, op1, op2) => context.Subtract(opD, context.Multiply(op1, op2)), !op.U);
        }

        public static void Vmul_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FmulS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarBinaryOpF32(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
            }
            else if (Optimizations.FastFP)
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2);
                });
            }
        }

        public static void Vmul_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FmulV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorBinaryOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
            }
            else if (Optimizations.FastFP)
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => context.Multiply(op1, op2));
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2);
                });
            }
        }

        public static void Vmul_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (op.U) // This instruction is always signed, U indicates polynomial mode.
            {
                EmitVectorBinaryOpZx32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size));
            }
            else
            {
                EmitVectorBinaryOpSx32(context, (op1, op2) => context.Multiply(op1, op2));
            }
        }

        public static void Vmul_1(ArmEmitterContext context)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            if (op.F)
            {
                if (Optimizations.FastFP && Optimizations.UseSse2)
                {
                    EmitVectorByScalarOpF32(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
                }
                else if (Optimizations.FastFP)
                {
                    EmitVectorByScalarOpF32(context, (op1, op2) => context.Multiply(op1, op2));
                }
                else
                {
                    EmitVectorByScalarOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMulFpscr), op1, op2));
                }
            }
            else
            {
                EmitVectorByScalarOpI32(context, (op1, op2) => context.Multiply(op1, op2), false);
            }
        }

        public static void Vmull_1(ArmEmitterContext context)
        {
            OpCode32SimdRegElem op = (OpCode32SimdRegElem)context.CurrOp;

            EmitVectorByScalarLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
        }

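        // VMULL.P8/P64 performs a carry-less (polynomial) multiply. The 64-bit variant
        // is delegated to a managed helper that returns the full 128-bit product.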
        public static void Vmull_I(ArmEmitterContext context)
        {
            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;

            if (op.Polynomial)
            {
                if (op.Size == 0) // P8
                {
                    EmitVectorBinaryLongOpI32(context, (op1, op2) => EmitPolynomialMultiply(context, op1, op2, 8 << op.Size), false);
                }
                else /* if (op.Size == 2) // P64 */
                {
                    Operand ne = context.VectorExtract(OperandType.I64, GetVec(op.Qn), op.Vn & 1);
                    Operand me = context.VectorExtract(OperandType.I64, GetVec(op.Qm), op.Vm & 1);

                    Operand res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);

                    context.Copy(GetVecA32(op.Qd), res);
                }
            }
            else
            {
                EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Multiply(op1, op2), !op.U);
            }
        }

        public static void Vpadd_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FaddpV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Addps);
            }
            else
            {
                EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPAddFpscr), op1, op2));
            }
        }

        public static void Vpadd_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (Optimizations.UseSsse3)
            {
                EmitSsse3VectorPairwiseOp32(context, X86PaddInstruction);
            }
            else
            {
                EmitVectorPairwiseOpI32(context, (op1, op2) => context.Add(op1, op2), !op.U);
            }
        }

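        // Pairwise add-accumulate long: adjacent element pairs are summed into
        // double-width lanes and accumulated into the destination. The signedness of
        // the source elements comes from the Opc field.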
        public static void Vpadal(ArmEmitterContext context)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            EmitVectorPairwiseTernaryLongOpI32(context, (op1, op2, op3) => context.Add(context.Add(op1, op2), op3), op.Opc != 1);
        }

        public static void Vpaddl(ArmEmitterContext context)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            EmitVectorPairwiseLongOpI32(context, (op1, op2) => context.Add(op1, op2), (op.Opc & 1) == 0);
        }

        public static void Vpmax_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FmaxpV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Maxps);
            }
            else
            {
                EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMaxFpscr), op1, op2));
            }
        }

        public static void Vpmax_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (Optimizations.UseSsse3)
            {
                EmitSsse3VectorPairwiseOp32(context, op.U ? X86PmaxuInstruction : X86PmaxsInstruction);
            }
            else
            {
                EmitVectorPairwiseOpI32(context, (op1, op2) =>
                {
                    Operand greater = op.U ? context.ICompareGreaterUI(op1, op2) : context.ICompareGreater(op1, op2);
                    return context.ConditionalSelect(greater, op1, op2);
                }, !op.U);
            }
        }

        public static void Vpmin_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorPairwiseOpF32(context, Intrinsic.Arm64FminpV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitSse2VectorPairwiseOpF32(context, Intrinsic.X86Minps);
            }
            else
            {
                EmitVectorPairwiseOpF32(context, (op1, op2) => EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPMinFpscr), op1, op2));
            }
        }

        public static void Vpmin_I(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            if (Optimizations.UseSsse3)
            {
                EmitSsse3VectorPairwiseOp32(context, op.U ? X86PminuInstruction : X86PminsInstruction);
            }
            else
            {
                EmitVectorPairwiseOpI32(context, (op1, op2) =>
                {
                    Operand less = op.U ? context.ICompareLessUI(op1, op2) : context.ICompareLess(op1, op2);
                    return context.ConditionalSelect(less, op1, op2);
                }, !op.U);
            }
        }

        public static void Vqadd(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitSaturatingAddSubBinaryOp(context, add: true, !op.U);
        }

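        // Saturating doubling multiply returning high half: (2 * op1 * op2) >> eSize,
        // folded here into a single (eSize - 1) shift. 32-bit elements are widened to
        // 64 bits first so the product cannot overflow before the shift.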
        public static void Vqdmulh(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
            int eSize = 8 << op.Size;

            EmitVectorBinaryOpI32(context, (op1, op2) =>
            {
                if (op.Size == 2)
                {
                    op1 = context.SignExtend32(OperandType.I64, op1);
                    op2 = context.SignExtend32(OperandType.I64, op2);
                }

                Operand res = context.Multiply(op1, op2);
                res = context.ShiftRightSI(res, Const(eSize - 1));
                res = EmitSatQ(context, res, eSize, signedSrc: true, signedDst: true);

                if (op.Size == 2)
                {
                    res = context.ConvertI64ToI32(res);
                }

                return res;
            }, signed: true);
        }

        public static void Vqmovn(ArmEmitterContext context)
        {
            OpCode32SimdMovn op = (OpCode32SimdMovn)context.CurrOp;

            bool signed = !op.Q;

            EmitVectorUnaryNarrowOp32(context, (op1) => EmitSatQ(context, op1, 8 << op.Size, signed, signed), signed);
        }

        public static void Vqmovun(ArmEmitterContext context)
        {
            OpCode32SimdMovn op = (OpCode32SimdMovn)context.CurrOp;

            EmitVectorUnaryNarrowOp32(context, (op1) => EmitSatQ(context, op1, 8 << op.Size, signedSrc: true, signedDst: false), signed: true);
        }

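        // Rounding variant of VQDMULH: adds a rounding constant of 2^(eSize - 2), half
        // the weight of the last bit kept, before the same (eSize - 1) shift and saturation.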
        public static void Vqrdmulh(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
            int eSize = 8 << op.Size;

            EmitVectorBinaryOpI32(context, (op1, op2) =>
            {
                if (op.Size == 2)
                {
                    op1 = context.SignExtend32(OperandType.I64, op1);
                    op2 = context.SignExtend32(OperandType.I64, op2);
                }

                Operand res = context.Multiply(op1, op2);
                res = context.Add(res, Const(res.Type, 1L << (eSize - 2)));
                res = context.ShiftRightSI(res, Const(eSize - 1));
                res = EmitSatQ(context, res, eSize, signedSrc: true, signedDst: true);

                if (op.Size == 2)
                {
                    res = context.ConvertI64ToI32(res);
                }

                return res;
            }, signed: true);
        }

        public static void Vqsub(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitSaturatingAddSubBinaryOp(context, add: false, !op.U);
        }

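        // Element reversal within 16/32/64-bit regions. The SSSE3 path picks a PSHUFB
        // byte-permutation mask (or a SHUFPS lane swap) per reversal width and element
        // size; the fallback reverses using shifts and masks on each extracted element.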
        public static void Vrev(ArmEmitterContext context)
        {
            OpCode32SimdRev op = (OpCode32SimdRev)context.CurrOp;

            if (Optimizations.UseSsse3)
            {
                EmitVectorUnaryOpSimd32(context, (op1) =>
                {
                    Operand mask;
                    switch (op.Size)
                    {
                        case 3:
                            // Rev64
                            switch (op.Opc)
                            {
                                case 0:
                                    mask = X86GetElements(context, 0x08090a0b0c0d0e0fL, 0x0001020304050607L);
                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                                case 1:
                                    mask = X86GetElements(context, 0x09080b0a0d0c0f0eL, 0x0100030205040706L);
                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                                case 2:
                                    return context.AddIntrinsic(Intrinsic.X86Shufps, op1, op1, Const(1 | (0 << 2) | (3 << 4) | (2 << 6)));
                            }
                            break;
                        case 2:
                            // Rev32
                            switch (op.Opc)
                            {
                                case 0:
                                    mask = X86GetElements(context, 0x0c0d0e0f_08090a0bL, 0x04050607_00010203L);
                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                                case 1:
                                    mask = X86GetElements(context, 0x0d0c0f0e_09080b0aL, 0x05040706_01000302L);
                                    return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                            }
                            break;
                        case 1:
                            // Rev16
                            mask = X86GetElements(context, 0x0e0f_0c0d_0a0b_0809L, 0x0607_0405_0203_0001L);
                            return context.AddIntrinsic(Intrinsic.X86Pshufb, op1, mask);
                    }

                    throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable.
                });
            }
            else
            {
                EmitVectorUnaryOpZx32(context, (op1) =>
                {
                    switch (op.Opc)
                    {
                        case 0:
                            switch (op.Size) // Swap bytes.
                            {
                                case 1:
                                    return InstEmitAluHelper.EmitReverseBytes16_32Op(context, op1);
                                case 2:
                                case 3:
                                    return context.ByteSwap(op1);
                            }
                            break;
                        case 1:
                            switch (op.Size)
                            {
                                case 2:
                                    return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff0000)), Const(16)),
                                        context.ShiftLeft(context.BitwiseAnd(op1, Const(0x0000ffff)), Const(16)));
                                case 3:
                                    return context.BitwiseOr(
                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffff000000000000ul)), Const(48)),
                                            context.ShiftLeft(context.BitwiseAnd(op1, Const(0x000000000000fffful)), Const(48))),
                                        context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0x0000ffff00000000ul)), Const(16)),
                                            context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000ffff0000ul)), Const(16))));
                            }
                            break;
                        case 2:
                            // Swap upper and lower halves.
                            return context.BitwiseOr(context.ShiftRightUI(context.BitwiseAnd(op1, Const(0xffffffff00000000ul)), Const(32)),
                                context.ShiftLeft(context.BitwiseAnd(op1, Const(0x00000000fffffffful)), Const(32)));
                    }

                    throw new InvalidOperationException("Invalid VREV Opcode + Size combo."); // Should be unreachable.
                });
            }
        }

        public static void Vrecpe(ArmEmitterContext context)
        {
            OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp;

            if (op.F)
            {
                int sizeF = op.Size & 1;

                if (Optimizations.FastFP && Optimizations.UseAdvSimd)
                {
                    InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrecpeV);
                }
                else if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
                {
                    EmitVectorUnaryOpF32(context, Intrinsic.X86Rcpps, 0);
                }
                else
                {
                    EmitVectorUnaryOpF32(context, (op1) =>
                    {
                        return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRecipEstimateFpscr), op1);
                    });
                }
            }
            else
            {
                throw new NotImplementedException("Integer Vrecpe not currently implemented.");
            }
        }

        public static void Vrecps(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FrecpsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
                bool single = (op.Size & 1) == 0;

                // (2 - (n*m))
                EmitVectorBinaryOpSimd32(context, (n, m) =>
                {
                    if (single)
                    {
                        Operand maskTwo = X86GetAllElements(context, 2f);

                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);

                        return context.AddIntrinsic(Intrinsic.X86Subps, maskTwo, res);
                    }
                    else
                    {
                        Operand maskTwo = X86GetAllElements(context, 2d);

                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);

                        return context.AddIntrinsic(Intrinsic.X86Subpd, maskTwo, res);
                    }
                });
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStep), op1, op2);
                });
            }
        }

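        // Rounding halving add: (op1 + op2 + 1) >> 1, with 32-bit elements widened to
        // 64 bits so the intermediate sum cannot overflow.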
        public static void Vrhadd(ArmEmitterContext context)
        {
            OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

            EmitVectorBinaryOpI32(context, (op1, op2) =>
            {
                if (op.Size == 2)
                {
                    op1 = context.ZeroExtend32(OperandType.I64, op1);
                    op2 = context.ZeroExtend32(OperandType.I64, op2);
                }

                Operand res = context.Add(context.Add(op1, op2), Const(op1.Type, 1L));
                res = context.ShiftRightUI(res, Const(1));

                if (op.Size == 2)
                {
                    res = context.ConvertI64ToI32(res);
                }

                return res;
            }, !op.U);
        }

        public static void Vrsqrte(ArmEmitterContext context)
        {
            OpCode32SimdSqrte op = (OpCode32SimdSqrte)context.CurrOp;

            if (op.F)
            {
                int sizeF = op.Size & 1;

                if (Optimizations.FastFP && Optimizations.UseAdvSimd)
                {
                    InstEmitSimdHelper32Arm64.EmitVectorUnaryOpF32(context, Intrinsic.Arm64FrsqrteV);
                }
                else if (Optimizations.FastFP && Optimizations.UseSse2 && sizeF == 0)
                {
                    EmitVectorUnaryOpF32(context, Intrinsic.X86Rsqrtps, 0);
                }
                else
                {
                    EmitVectorUnaryOpF32(context, (op1) =>
                    {
                        return EmitSoftFloatCallDefaultFpscr(context, nameof(SoftFloat32.FPRSqrtEstimateFpscr), op1);
                    });
                }
            }
            else
            {
                throw new NotImplementedException("Integer Vrsqrte not currently implemented.");
            }
        }

        public static void Vrsqrts(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FrsqrtsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
                bool single = (op.Size & 1) == 0;

                // (3 - (n*m)) / 2
                EmitVectorBinaryOpSimd32(context, (n, m) =>
                {
                    if (single)
                    {
                        Operand maskHalf = X86GetAllElements(context, 0.5f);
                        Operand maskThree = X86GetAllElements(context, 3f);

                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);

                        res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
                        return context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
                    }
                    else
                    {
                        Operand maskHalf = X86GetAllElements(context, 0.5d);
                        Operand maskThree = X86GetAllElements(context, 3d);

                        Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);

                        res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
                        return context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
                    }
                });
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStep), op1, op2);
                });
            }
        }

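        // VSEL selects between the two source registers based on the current condition
        // flags; only the EQ, GE, GT and VS conditions are encodable.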
        public static void Vsel(ArmEmitterContext context)
        {
            OpCode32SimdSel op = (OpCode32SimdSel)context.CurrOp;

            Operand condition = default;

            switch (op.Cc)
            {
                case OpCode32SimdSelMode.Eq:
                    condition = GetCondTrue(context, Condition.Eq);
                    break;
                case OpCode32SimdSelMode.Ge:
                    condition = GetCondTrue(context, Condition.Ge);
                    break;
                case OpCode32SimdSelMode.Gt:
                    condition = GetCondTrue(context, Condition.Gt);
                    break;
                case OpCode32SimdSelMode.Vs:
                    condition = GetCondTrue(context, Condition.Vs);
                    break;
            }

            EmitScalarBinaryOpI32(context, (op1, op2) =>
            {
                return context.ConditionalSelect(condition, op1, op2);
            });
        }

        public static void Vsqrt_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FsqrtS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarUnaryOpF32(context, Intrinsic.X86Sqrtss, Intrinsic.X86Sqrtsd);
            }
            else
            {
                EmitScalarUnaryOpF32(context, (op1) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1);
                });
            }
        }

        public static void Vsub_S(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitScalarBinaryOpF32(context, Intrinsic.Arm64FsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitScalarBinaryOpF32(context, Intrinsic.X86Subss, Intrinsic.X86Subsd);
            }
            else
            {
                EmitScalarBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2));
            }
        }

        public static void Vsub_V(ArmEmitterContext context)
        {
            if (Optimizations.FastFP && Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelper32Arm64.EmitVectorBinaryOpF32(context, Intrinsic.Arm64FsubV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                EmitVectorBinaryOpF32(context, Intrinsic.X86Subps, Intrinsic.X86Subpd);
            }
            else
            {
                EmitVectorBinaryOpF32(context, (op1, op2) => context.Subtract(op1, op2));
            }
        }

        public static void Vsub_I(ArmEmitterContext context)
        {
            if (Optimizations.UseSse2)
            {
                OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;
                EmitVectorBinaryOpSimd32(context, (op1, op2) => context.AddIntrinsic(X86PsubInstruction[op.Size], op1, op2));
            }
            else
            {
                EmitVectorBinaryOpZx32(context, (op1, op2) => context.Subtract(op1, op2));
            }
        }

        public static void Vsubl_I(ArmEmitterContext context)
        {
            OpCode32SimdRegLong op = (OpCode32SimdRegLong)context.CurrOp;

            EmitVectorBinaryLongOpI32(context, (op1, op2) => context.Subtract(op1, op2), !op.U);
        }

        public static void Vsubw_I(ArmEmitterContext context)
        {
            OpCode32SimdRegWide op = (OpCode32SimdRegWide)context.CurrOp;

            EmitVectorBinaryWideOpI32(context, (op1, op2) => context.Subtract(op1, op2), !op.U);
        }

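        // Shared helper for VQADD/VQSUB. Sizes below 64 bits compute the exact result
        // (widening the 32-bit case to 64 bits) and clamp it with EmitSatQ; the 64-bit
        // case uses dedicated saturating add/subtract helpers.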
        private static void EmitSaturatingAddSubBinaryOp(ArmEmitterContext context, bool add, bool signed)
        {
            OpCode32Simd op = (OpCode32Simd)context.CurrOp;

            EmitVectorBinaryOpI32(context, (ne, me) =>
            {
                if (op.Size <= 2)
                {
                    if (op.Size == 2)
                    {
                        ne = signed ? context.SignExtend32(OperandType.I64, ne) : context.ZeroExtend32(OperandType.I64, ne);
                        me = signed ? context.SignExtend32(OperandType.I64, me) : context.ZeroExtend32(OperandType.I64, me);
                    }

                    Operand res = add ? context.Add(ne, me) : context.Subtract(ne, me);

                    res = EmitSatQ(context, res, 8 << op.Size, signedSrc: true, signed);

                    if (op.Size == 2)
                    {
                        res = context.ConvertI64ToI32(res);
                    }

                    return res;
                }
                else if (add) /* if (op.Size == 3) */
                {
                    return signed
                        ? EmitBinarySignedSatQAdd(context, ne, me)
                        : EmitBinaryUnsignedSatQAdd(context, ne, me);
                }
                else /* if (sub) */
                {
                    return signed
                        ? EmitBinarySignedSatQSub(context, ne, me)
                        : EmitBinaryUnsignedSatQSub(context, ne, me);
                }
            }, signed);
        }

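        // Implements the FPMaxNum/FPMinNum quiet-NaN rule on SSE4.1: an operand that is
        // a quiet NaN (while the other is not) is first replaced with the "weakest"
        // value (-Inf for max, +Inf for min), so MAXPS/MINPS then picks the other input.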
        private static void EmitSse41MaxMinNumOpF32(ArmEmitterContext context, bool isMaxNum, bool scalar)
        {
            IOpCode32Simd op = (IOpCode32Simd)context.CurrOp;

            Operand genericEmit(Operand n, Operand m)
            {
                Operand nNum = context.Copy(n);
                Operand mNum = context.Copy(m);

                InstEmit.EmitSse2VectorIsNaNOpF(context, nNum, out Operand nQNaNMask, out _, isQNaN: true);
                InstEmit.EmitSse2VectorIsNaNOpF(context, mNum, out Operand mQNaNMask, out _, isQNaN: true);

                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    Operand negInfMask = X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity);

                    Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask);
                    Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask);

                    nNum = context.AddIntrinsic(Intrinsic.X86Blendvps, nNum, negInfMask, nMask);
                    mNum = context.AddIntrinsic(Intrinsic.X86Blendvps, mNum, negInfMask, mMask);

                    return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxps : Intrinsic.X86Minps, nNum, mNum);
                }
                else /* if (sizeF == 1) */
                {
                    Operand negInfMask = X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity);

                    Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask);
                    Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask);

                    nNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, nNum, negInfMask, nMask);
                    mNum = context.AddIntrinsic(Intrinsic.X86Blendvpd, mNum, negInfMask, mMask);

                    return context.AddIntrinsic(isMaxNum ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, nNum, mNum);
                }
            }

            if (scalar)
            {
                EmitScalarBinaryOpSimd32(context, genericEmit);
            }
            else
            {
                EmitVectorBinaryOpSimd32(context, genericEmit);
            }
        }
    }
}