// src/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs
   1  // https://github.com/intel/ARM_NEON_2_x86_SSE/blob/master/NEON_2_SSE.h
   2  // https://www.agner.org/optimize/#vectorclass @ vectori128.h
   3  
   4  using ARMeilleure.Decoders;
   5  using ARMeilleure.IntermediateRepresentation;
   6  using ARMeilleure.State;
   7  using ARMeilleure.Translation;
   8  using System;
   9  using System.Diagnostics;
  10  using static ARMeilleure.Instructions.InstEmitHelper;
  11  using static ARMeilleure.Instructions.InstEmitSimdHelper;
  12  using static ARMeilleure.Instructions.InstEmitSimdHelper32;
  13  using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
  14  
  15  namespace ARMeilleure.Instructions
  16  {
  17      using Func2I = Func<Operand, Operand, Operand>;
  18  
  19      static partial class InstEmit
  20      {
  21          public static void Abs_S(ArmEmitterContext context)
  22          {
  23              if (Optimizations.UseAdvSimd)
  24              {
  25                  InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64AbsS);
  26              }
  27              else
  28              {
  29                  EmitScalarUnaryOpSx(context, (op1) => EmitAbs(context, op1));
  30              }
  31          }
  32  
  33          public static void Abs_V(ArmEmitterContext context)
  34          {
  35              if (Optimizations.UseAdvSimd)
  36              {
  37                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64AbsV);
  38              }
  39              else
  40              {
  41                  EmitVectorUnaryOpSx(context, (op1) => EmitAbs(context, op1));
  42              }
  43          }
  44  
  45          public static void Add_S(ArmEmitterContext context)
  46          {
  47              if (Optimizations.UseAdvSimd)
  48              {
  49                  InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64AddS);
  50              }
  51              else
  52              {
  53                  EmitScalarBinaryOpZx(context, (op1, op2) => context.Add(op1, op2));
  54              }
  55          }
  56  
  57          public static void Add_V(ArmEmitterContext context)
  58          {
  59              if (Optimizations.UseAdvSimd)
  60              {
  61                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AddV);
  62              }
  63              else if (Optimizations.UseSse2)
  64              {
  65                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
  66  
  67                  Operand n = GetVec(op.Rn);
  68                  Operand m = GetVec(op.Rm);
  69  
  70                  Intrinsic addInst = X86PaddInstruction[op.Size];
  71  
  72                  Operand res = context.AddIntrinsic(addInst, n, m);
  73  
  74                  if (op.RegisterSize == RegisterSize.Simd64)
  75                  {
  76                      res = context.VectorZeroUpper64(res);
  77                  }
  78  
  79                  context.Copy(GetVec(op.Rd), res);
  80              }
  81              else
  82              {
  83                  EmitVectorBinaryOpZx(context, (op1, op2) => context.Add(op1, op2));
  84              }
  85          }
  86  
  87          public static void Addhn_V(ArmEmitterContext context)
  88          {
  89              if (Optimizations.UseAdvSimd)
  90              {
  91                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64AddhnV);
  92              }
  93              else
  94              {
  95                  EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: false);
  96              }
  97          }
  98  
  99          public static void Addp_S(ArmEmitterContext context)
 100          {
 101              if (Optimizations.UseAdvSimd)
 102              {
 103                  InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64AddpS);
 104              }
 105              else
 106              {
 107                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 108  
 109                  Operand ne0 = EmitVectorExtractZx(context, op.Rn, 0, op.Size);
 110                  Operand ne1 = EmitVectorExtractZx(context, op.Rn, 1, op.Size);
 111  
 112                  Operand res = context.Add(ne0, ne1);
 113  
 114                  context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, op.Size));
 115              }
 116          }
 117  
 118          public static void Addp_V(ArmEmitterContext context)
 119          {
 120              if (Optimizations.UseAdvSimd)
 121              {
 122                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64AddpV);
 123              }
 124              else if (Optimizations.UseSsse3)
 125              {
 126                  EmitSsse3VectorPairwiseOp(context, X86PaddInstruction);
 127              }
 128              else
 129              {
 130                  EmitVectorPairwiseOpZx(context, (op1, op2) => context.Add(op1, op2));
 131              }
 132          }
 133  
 134          public static void Addv_V(ArmEmitterContext context)
 135          {
 136              if (Optimizations.UseAdvSimd)
 137              {
 138                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64AddvV);
 139              }
 140              else
 141              {
 142                  EmitVectorAcrossVectorOpZx(context, (op1, op2) => context.Add(op1, op2));
 143              }
 144          }
 145  
 146          public static void Cls_V(ArmEmitterContext context)
 147          {
 148              if (Optimizations.UseAdvSimd)
 149              {
 150                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64ClsV);
 151              }
 152              else
 153              {
 154                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 155  
 156                  Operand res = context.VectorZero();
 157  
 158                  int elems = op.GetBytesCount() >> op.Size;
 159  
 160                  int eSize = 8 << op.Size;
 161  
 162                  for (int index = 0; index < elems; index++)
 163                  {
 164                      Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
 165  
 166                      Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingSigns)), ne, Const(eSize));
 167  
 168                      res = EmitVectorInsert(context, res, de, index, op.Size);
 169                  }
 170  
 171                  context.Copy(GetVec(op.Rd), res);
 172              }
 173          }
 174  
        public static void Clz_V(ArmEmitterContext context)
        {
            // CLZ (vector): count leading zeros per element.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64ClzV);
            }
            else
            {
                OpCodeSimd op = (OpCodeSimd)context.CurrOp;

                // Element size in bits.
                int eSize = 8 << op.Size;

                // Try the x86 fast paths first. Each helper returns default when the
                // host extension it needs is unavailable; eSize == 64 has no fast path.
                Operand res = eSize switch
                {
                    8 => Clz_V_I8(context, GetVec(op.Rn)),
                    16 => Clz_V_I16(context, GetVec(op.Rn)),
                    32 => Clz_V_I32(context, GetVec(op.Rn)),
                    _ => default,
                };

                if (res != default)
                {
                    // Fast path produced a full 128-bit result; for 64-bit operations
                    // the upper half of the destination must be cleared.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }
                }
                else
                {
                    // Software fallback: per-element call into the managed helper.
                    int elems = op.GetBytesCount() >> op.Size;

                    res = context.VectorZero();

                    for (int index = 0; index < elems; index++)
                    {
                        Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);

                        Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));

                        res = EmitVectorInsert(context, res, de, index, op.Size);
                    }
                }

                context.Copy(GetVec(op.Rd), res);
            }
        }
 221  
        // Per-byte CLZ using PSHUFB as a 16-entry nibble lookup table.
        // Returns default when SSSE3 is unavailable so the caller can fall back.
        private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
        {
            if (!Optimizations.UseSsse3)
            {
                return default;
            }

            // CLZ nibble table: byte i holds clz(i) for a 4-bit value.
            // Low-to-high bytes: clz(0)=4, clz(1)=3, clz(2)=clz(3)=2, clz(4..7)=1; entries 8..15 are 0.
            Operand clzTable = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);

            Operand maskLow = X86GetAllElements(context, 0x0f_0f_0f_0f);
            Operand c04 = X86GetAllElements(context, 0x04_04_04_04);

            // CLZ of low 4 bits of elements in arg.
            // (PSHUFB zeroes lanes whose index byte has bit 7 set, but such a lane has a
            // nonzero high nibble, so its low-nibble CLZ is discarded below anyway.)
            Operand loClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, arg);

            // Get the high 4 bits of elements in arg.
            Operand hiArg = context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4));
            hiArg = context.AddIntrinsic(Intrinsic.X86Pand, hiArg, maskLow);

            // CLZ of high 4 bits of elements in arg.
            Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, clzTable, hiArg);

            // If high 4 bits are not all zero, we discard the CLZ of the low 4 bits.
            // hiClz == 4 means "high nibble was zero"; PCMPEQB builds the keep-mask from that.
            Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, hiClz, c04);
            loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask);

            // Total per-byte CLZ = CLZ(high nibble) + (high nibble zero ? CLZ(low nibble) : 0).
            return context.AddIntrinsic(Intrinsic.X86Paddb, loClz, hiClz);
        }
 251  
        // Per-16-bit-element CLZ built on top of the per-byte CLZ helper.
        // Returns default when SSSE3 is unavailable so the caller can fall back.
        private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
        {
            if (!Optimizations.UseSsse3)
            {
                return default;
            }

            // Shuffle mask moving the high byte of each 16-bit lane into the low byte
            // position; 0x80 entries zero the (now unused) high byte of each lane.
            Operand maskSwap = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
            Operand maskLow = X86GetAllElements(context, 0x00ff_00ff);
            Operand c0008 = X86GetAllElements(context, 0x0008_0008);

            // CLZ pair of high 8 and low 8 bits of elements in arg.
            Operand hiloClz = Clz_V_I8(context, arg);
            // Get CLZ of low 8 bits in each pair.
            Operand loClz = context.AddIntrinsic(Intrinsic.X86Pand, hiloClz, maskLow);
            // Get CLZ of high 8 bits in each pair.
            Operand hiClz = context.AddIntrinsic(Intrinsic.X86Pshufb, hiloClz, maskSwap);

            // If high 8 bits are not all zero, we discard the CLZ of the low 8 bits.
            // hiClz == 8 means "high byte was zero"; PCMPEQW builds the keep-mask from that.
            Operand mask = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, hiClz, c0008);
            loClz = context.AddIntrinsic(Intrinsic.X86Pand, loClz, mask);

            // Total per-word CLZ = CLZ(high byte) + (high byte zero ? CLZ(low byte) : 0).
            return context.AddIntrinsic(Intrinsic.X86Paddw, loClz, hiClz);
        }
 276  
        // Per-32-bit-element CLZ via the classic "smear bits right, invert, popcount"
        // SWAR sequence. Returns default when SSE2 is unavailable so the caller can fall back.
        private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
        {
            // TODO: Use vplzcntd when AVX-512 is supported.
            if (!Optimizations.UseSse2)
            {
                return default;
            }

#pragma warning disable IDE0055 // Disable formatting
            Operand AddVectorI32(Operand op0, Operand op1)      => context.AddIntrinsic(Intrinsic.X86Paddd, op0, op1);
            Operand SubVectorI32(Operand op0, Operand op1)      => context.AddIntrinsic(Intrinsic.X86Psubd, op0, op1);
            Operand ShiftRightVectorUI32(Operand op0, int imm8) => context.AddIntrinsic(Intrinsic.X86Psrld, op0, Const(imm8));
            Operand OrVector(Operand op0, Operand op1)          => context.AddIntrinsic(Intrinsic.X86Por, op0, op1);
            Operand AndVector(Operand op0, Operand op1)         => context.AddIntrinsic(Intrinsic.X86Pand, op0, op1);
            // PANDN computes ~op0 & op1; with an all-ones second operand this is a bitwise NOT.
            Operand NotVector(Operand op0)                      => context.AddIntrinsic(Intrinsic.X86Pandn, op0, context.VectorOne());
#pragma warning restore IDE0055

            // Bit-trick constants for the parallel (SWAR) popcount below.
            Operand c55555555 = X86GetAllElements(context, 0x55555555);
            Operand c33333333 = X86GetAllElements(context, 0x33333333);
            Operand c0f0f0f0f = X86GetAllElements(context, 0x0f0f0f0f);
            Operand c0000003f = X86GetAllElements(context, 0x0000003f);

            Operand tmp0;
            Operand tmp1;
            Operand res;

            // Set all bits after highest set bit to 1.
            res = OrVector(ShiftRightVectorUI32(arg, 1), arg);
            res = OrVector(ShiftRightVectorUI32(res, 2), res);
            res = OrVector(ShiftRightVectorUI32(res, 4), res);
            res = OrVector(ShiftRightVectorUI32(res, 8), res);
            res = OrVector(ShiftRightVectorUI32(res, 16), res);

            // Make leading 0s into leading 1s.
            res = NotVector(res);

            // Count leading 1s, which is the population count.
            // Step 1: sum of bits in each 2-bit group.
            tmp0 = ShiftRightVectorUI32(res, 1);
            tmp0 = AndVector(tmp0, c55555555);
            res = SubVectorI32(res, tmp0);

            // Step 2: sum of each pair of 2-bit counts into 4-bit counts.
            tmp0 = ShiftRightVectorUI32(res, 2);
            tmp0 = AndVector(tmp0, c33333333);
            tmp1 = AndVector(res, c33333333);
            res = AddVectorI32(tmp0, tmp1);

            // Step 3: fold into per-byte counts.
            tmp0 = ShiftRightVectorUI32(res, 4);
            tmp0 = AddVectorI32(tmp0, res);
            res = AndVector(tmp0, c0f0f0f0f);

            // Steps 4-5: accumulate the four byte counts into the low byte.
            tmp0 = ShiftRightVectorUI32(res, 8);
            res = AddVectorI32(tmp0, res);

            tmp0 = ShiftRightVectorUI32(res, 16);
            res = AddVectorI32(tmp0, res);

            // Keep only the final count (0..32 fits in 6 bits).
            res = AndVector(res, c0000003f);

            return res;
        }
 337  
 338          public static void Cnt_V(ArmEmitterContext context)
 339          {
 340              if (Optimizations.UseAdvSimd)
 341              {
 342                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64CntV);
 343              }
 344              else
 345              {
 346                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 347  
 348                  Operand res = context.VectorZero();
 349  
 350                  int elems = op.RegisterSize == RegisterSize.Simd128 ? 16 : 8;
 351  
 352                  for (int index = 0; index < elems; index++)
 353                  {
 354                      Operand ne = EmitVectorExtractZx(context, op.Rn, index, 0);
 355  
 356                      Operand de;
 357  
 358                      if (Optimizations.UsePopCnt)
 359                      {
 360                          de = context.AddIntrinsicLong(Intrinsic.X86Popcnt, ne);
 361                      }
 362                      else
 363                      {
 364                          de = EmitCountSetBits8(context, ne);
 365                      }
 366  
 367                      res = EmitVectorInsert(context, res, de, index, 0);
 368                  }
 369  
 370                  context.Copy(GetVec(op.Rd), res);
 371              }
 372          }
 373  
 374          public static void Fabd_S(ArmEmitterContext context)
 375          {
 376              if (Optimizations.UseAdvSimd)
 377              {
 378                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FabdS);
 379              }
 380              else if (Optimizations.FastFP && Optimizations.UseSse2)
 381              {
 382                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 383  
 384                  int sizeF = op.Size & 1;
 385  
 386                  if (sizeF == 0)
 387                  {
 388                      Operand res = context.AddIntrinsic(Intrinsic.X86Subss, GetVec(op.Rn), GetVec(op.Rm));
 389  
 390                      res = EmitFloatAbs(context, res, true, false);
 391  
 392                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
 393                  }
 394                  else /* if (sizeF == 1) */
 395                  {
 396                      Operand res = context.AddIntrinsic(Intrinsic.X86Subsd, GetVec(op.Rn), GetVec(op.Rm));
 397  
 398                      res = EmitFloatAbs(context, res, false, false);
 399  
 400                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
 401                  }
 402              }
 403              else
 404              {
 405                  EmitScalarBinaryOpF(context, (op1, op2) =>
 406                  {
 407                      Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2);
 408  
 409                      return EmitUnaryMathCall(context, nameof(Math.Abs), res);
 410                  });
 411              }
 412          }
 413  
 414          public static void Fabd_V(ArmEmitterContext context)
 415          {
 416              if (Optimizations.UseAdvSimd)
 417              {
 418                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FabdV);
 419              }
 420              else if (Optimizations.FastFP && Optimizations.UseSse2)
 421              {
 422                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 423  
 424                  int sizeF = op.Size & 1;
 425  
 426                  if (sizeF == 0)
 427                  {
 428                      Operand res = context.AddIntrinsic(Intrinsic.X86Subps, GetVec(op.Rn), GetVec(op.Rm));
 429  
 430                      res = EmitFloatAbs(context, res, true, true);
 431  
 432                      if (op.RegisterSize == RegisterSize.Simd64)
 433                      {
 434                          res = context.VectorZeroUpper64(res);
 435                      }
 436  
 437                      context.Copy(GetVec(op.Rd), res);
 438                  }
 439                  else /* if (sizeF == 1) */
 440                  {
 441                      Operand res = context.AddIntrinsic(Intrinsic.X86Subpd, GetVec(op.Rn), GetVec(op.Rm));
 442  
 443                      res = EmitFloatAbs(context, res, false, true);
 444  
 445                      context.Copy(GetVec(op.Rd), res);
 446                  }
 447              }
 448              else
 449              {
 450                  EmitVectorBinaryOpF(context, (op1, op2) =>
 451                  {
 452                      Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2);
 453  
 454                      return EmitUnaryMathCall(context, nameof(Math.Abs), res);
 455                  });
 456              }
 457          }
 458  
 459          public static void Fabs_S(ArmEmitterContext context)
 460          {
 461              if (Optimizations.UseAdvSimd)
 462              {
 463                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FabsS);
 464              }
 465              else if (Optimizations.UseSse2)
 466              {
 467                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 468  
 469                  if (op.Size == 0)
 470                  {
 471                      Operand res = EmitFloatAbs(context, GetVec(op.Rn), true, false);
 472  
 473                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
 474                  }
 475                  else /* if (op.Size == 1) */
 476                  {
 477                      Operand res = EmitFloatAbs(context, GetVec(op.Rn), false, false);
 478  
 479                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
 480                  }
 481              }
 482              else
 483              {
 484                  EmitScalarUnaryOpF(context, (op1) =>
 485                  {
 486                      return EmitUnaryMathCall(context, nameof(Math.Abs), op1);
 487                  });
 488              }
 489          }
 490  
 491          public static void Fabs_V(ArmEmitterContext context)
 492          {
 493              if (Optimizations.UseAdvSimd)
 494              {
 495                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FabsV);
 496              }
 497              else if (Optimizations.UseSse2)
 498              {
 499                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 500  
 501                  int sizeF = op.Size & 1;
 502  
 503                  if (sizeF == 0)
 504                  {
 505                      Operand res = EmitFloatAbs(context, GetVec(op.Rn), true, true);
 506  
 507                      if (op.RegisterSize == RegisterSize.Simd64)
 508                      {
 509                          res = context.VectorZeroUpper64(res);
 510                      }
 511  
 512                      context.Copy(GetVec(op.Rd), res);
 513                  }
 514                  else /* if (sizeF == 1) */
 515                  {
 516                      Operand res = EmitFloatAbs(context, GetVec(op.Rn), false, true);
 517  
 518                      context.Copy(GetVec(op.Rd), res);
 519                  }
 520              }
 521              else
 522              {
 523                  EmitVectorUnaryOpF(context, (op1) =>
 524                  {
 525                      return EmitUnaryMathCall(context, nameof(Math.Abs), op1);
 526                  });
 527              }
 528          }
 529  
 530          public static void Fadd_S(ArmEmitterContext context)
 531          {
 532              if (Optimizations.UseAdvSimd)
 533              {
 534                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FaddS);
 535              }
 536              else if (Optimizations.FastFP && Optimizations.UseSse2)
 537              {
 538                  EmitScalarBinaryOpF(context, Intrinsic.X86Addss, Intrinsic.X86Addsd);
 539              }
 540              else if (Optimizations.FastFP)
 541              {
 542                  EmitScalarBinaryOpF(context, (op1, op2) => context.Add(op1, op2));
 543              }
 544              else
 545              {
 546                  EmitScalarBinaryOpF(context, (op1, op2) =>
 547                  {
 548                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
 549                  });
 550              }
 551          }
 552  
 553          public static void Fadd_V(ArmEmitterContext context)
 554          {
 555              if (Optimizations.UseAdvSimd)
 556              {
 557                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FaddV);
 558              }
 559              else if (Optimizations.FastFP && Optimizations.UseSse2)
 560              {
 561                  EmitVectorBinaryOpF(context, Intrinsic.X86Addps, Intrinsic.X86Addpd);
 562              }
 563              else if (Optimizations.FastFP)
 564              {
 565                  EmitVectorBinaryOpF(context, (op1, op2) => context.Add(op1, op2));
 566              }
 567              else
 568              {
 569                  EmitVectorBinaryOpF(context, (op1, op2) =>
 570                  {
 571                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
 572                  });
 573              }
 574          }
 575  
 576          public static void Faddp_S(ArmEmitterContext context)
 577          {
 578              if (Optimizations.UseAdvSimd)
 579              {
 580                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FaddpS);
 581              }
 582              else if (Optimizations.FastFP && Optimizations.UseSse3)
 583              {
 584                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
 585  
 586                  if ((op.Size & 1) == 0)
 587                  {
 588                      Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn));
 589  
 590                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
 591                  }
 592                  else /* if ((op.Size & 1) == 1) */
 593                  {
 594                      Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn));
 595  
 596                      context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
 597                  }
 598              }
 599              else
 600              {
 601                  EmitScalarPairwiseOpF(context, (op1, op2) =>
 602                  {
 603                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
 604                  });
 605              }
 606          }
 607  
 608          public static void Faddp_V(ArmEmitterContext context)
 609          {
 610              if (Optimizations.UseAdvSimd)
 611              {
 612                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FaddpV);
 613              }
 614              else if (Optimizations.FastFP && Optimizations.UseSse41)
 615              {
 616                  EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
 617                  {
 618                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 619                      {
 620                          IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
 621  
 622                          Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd;
 623  
 624                          return context.AddIntrinsic(addInst, op1, op2);
 625                      }, scalar: false, op1, op2);
 626                  });
 627              }
 628              else
 629              {
 630                  EmitVectorPairwiseOpF(context, (op1, op2) =>
 631                  {
 632                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
 633                  });
 634              }
 635          }
 636  
 637          public static void Fdiv_S(ArmEmitterContext context)
 638          {
 639              if (Optimizations.UseAdvSimd)
 640              {
 641                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FdivS);
 642              }
 643              else if (Optimizations.FastFP && Optimizations.UseSse2)
 644              {
 645                  EmitScalarBinaryOpF(context, Intrinsic.X86Divss, Intrinsic.X86Divsd);
 646              }
 647              else if (Optimizations.FastFP)
 648              {
 649                  EmitScalarBinaryOpF(context, (op1, op2) => context.Divide(op1, op2));
 650              }
 651              else
 652              {
 653                  EmitScalarBinaryOpF(context, (op1, op2) =>
 654                  {
 655                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2);
 656                  });
 657              }
 658          }
 659  
 660          public static void Fdiv_V(ArmEmitterContext context)
 661          {
 662              if (Optimizations.UseAdvSimd)
 663              {
 664                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FdivV);
 665              }
 666              else if (Optimizations.FastFP && Optimizations.UseSse2)
 667              {
 668                  EmitVectorBinaryOpF(context, Intrinsic.X86Divps, Intrinsic.X86Divpd);
 669              }
 670              else if (Optimizations.FastFP)
 671              {
 672                  EmitVectorBinaryOpF(context, (op1, op2) => context.Divide(op1, op2));
 673              }
 674              else
 675              {
 676                  EmitVectorBinaryOpF(context, (op1, op2) =>
 677                  {
 678                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPDiv), op1, op2);
 679                  });
 680              }
 681          }
 682  
 683          public static void Fmadd_S(ArmEmitterContext context) // Fused.
 684          {
 685              if (Optimizations.UseAdvSimd)
 686              {
 687                  InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FmaddS);
 688              }
 689              else if (Optimizations.FastFP && Optimizations.UseSse2)
 690              {
 691                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
 692  
 693                  Operand d = GetVec(op.Rd);
 694                  Operand a = GetVec(op.Ra);
 695                  Operand n = GetVec(op.Rn);
 696                  Operand m = GetVec(op.Rm);
 697  
 698                  Operand res;
 699  
 700                  if (op.Size == 0)
 701                  {
 702                      if (Optimizations.UseFma)
 703                      {
 704                          res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m);
 705                      }
 706                      else
 707                      {
 708                          res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
 709                          res = context.AddIntrinsic(Intrinsic.X86Addss, a, res);
 710                      }
 711  
 712                      context.Copy(d, context.VectorZeroUpper96(res));
 713                  }
 714                  else /* if (op.Size == 1) */
 715                  {
 716                      if (Optimizations.UseFma)
 717                      {
 718                          res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m);
 719                      }
 720                      else
 721                      {
 722                          res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
 723                          res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res);
 724                      }
 725  
 726                      context.Copy(d, context.VectorZeroUpper64(res));
 727                  }
 728              }
 729              else
 730              {
 731                  EmitScalarTernaryRaOpF(context, (op1, op2, op3) =>
 732                  {
 733                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
 734                  });
 735              }
 736          }
 737  
 738          public static void Fmax_S(ArmEmitterContext context)
 739          {
 740              if (Optimizations.UseAdvSimd)
 741              {
 742                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmaxS);
 743              }
 744              else if (Optimizations.FastFP && Optimizations.UseSse41)
 745              {
 746                  EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 747                  {
 748                      return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
 749                  }, scalar: true);
 750              }
 751              else
 752              {
 753                  EmitScalarBinaryOpF(context, (op1, op2) =>
 754                  {
 755                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2);
 756                  });
 757              }
 758          }
 759  
 760          public static void Fmax_V(ArmEmitterContext context)
 761          {
 762              if (Optimizations.UseAdvSimd)
 763              {
 764                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxV);
 765              }
 766              else if (Optimizations.FastFP && Optimizations.UseSse41)
 767              {
 768                  EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 769                  {
 770                      return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
 771                  }, scalar: false);
 772              }
 773              else
 774              {
 775                  EmitVectorBinaryOpF(context, (op1, op2) =>
 776                  {
 777                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2);
 778                  });
 779              }
 780          }
 781  
 782          public static void Fmaxnm_S(ArmEmitterContext context)
 783          {
 784              if (Optimizations.UseAdvSimd)
 785              {
 786                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmaxnmS);
 787              }
 788              else if (Optimizations.FastFP && Optimizations.UseSse41)
 789              {
 790                  EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true);
 791              }
 792              else
 793              {
 794                  EmitScalarBinaryOpF(context, (op1, op2) =>
 795                  {
 796                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
 797                  });
 798              }
 799          }
 800  
 801          public static void Fmaxnm_V(ArmEmitterContext context)
 802          {
 803              if (Optimizations.UseAdvSimd)
 804              {
 805                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxnmV);
 806              }
 807              else if (Optimizations.FastFP && Optimizations.UseSse41)
 808              {
 809                  EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false);
 810              }
 811              else
 812              {
 813                  EmitVectorBinaryOpF(context, (op1, op2) =>
 814                  {
 815                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
 816                  });
 817              }
 818          }
 819  
 820          public static void Fmaxnmp_S(ArmEmitterContext context)
 821          {
 822              if (Optimizations.UseAdvSimd)
 823              {
 824                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FmaxnmpS);
 825              }
 826              else if (Optimizations.FastFP && Optimizations.UseSse41)
 827              {
 828                  EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
 829                  {
 830                      return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true, op1, op2);
 831                  });
 832              }
 833              else
 834              {
 835                  EmitScalarPairwiseOpF(context, (op1, op2) =>
 836                  {
 837                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
 838                  });
 839              }
 840          }
 841  
 842          public static void Fmaxnmp_V(ArmEmitterContext context)
 843          {
 844              if (Optimizations.UseAdvSimd)
 845              {
 846                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxnmpV);
 847              }
 848              else if (Optimizations.FastFP && Optimizations.UseSse41)
 849              {
 850                  EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
 851                  {
 852                      return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2);
 853                  });
 854              }
 855              else
 856              {
 857                  EmitVectorPairwiseOpF(context, (op1, op2) =>
 858                  {
 859                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
 860                  });
 861              }
 862          }
 863  
 864          public static void Fmaxnmv_V(ArmEmitterContext context)
 865          {
 866              if (Optimizations.UseAdvSimd)
 867              {
 868                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FmaxnmvV);
 869              }
 870              else if (Optimizations.FastFP && Optimizations.UseSse41)
 871              {
 872                  EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
 873                  {
 874                      return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: false, op1, op2);
 875                  });
 876              }
 877              else
 878              {
 879                  EmitVectorAcrossVectorOpF(context, (op1, op2) =>
 880                  {
 881                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2);
 882                  });
 883              }
 884          }
 885  
 886          public static void Fmaxp_S(ArmEmitterContext context)
 887          {
 888              if (Optimizations.UseAdvSimd)
 889              {
 890                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FmaxpS);
 891              }
 892              else if (Optimizations.FastFP && Optimizations.UseSse41)
 893              {
 894                  EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
 895                  {
 896                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 897                      {
 898                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
 899                      }, scalar: true, op1, op2);
 900                  });
 901              }
 902              else
 903              {
 904                  EmitScalarPairwiseOpF(context, (op1, op2) =>
 905                  {
 906                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2);
 907                  });
 908              }
 909          }
 910  
 911          public static void Fmaxp_V(ArmEmitterContext context)
 912          {
 913              if (Optimizations.UseAdvSimd)
 914              {
 915                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmaxpV);
 916              }
 917              else if (Optimizations.FastFP && Optimizations.UseSse41)
 918              {
 919                  EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
 920                  {
 921                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 922                      {
 923                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
 924                      }, scalar: false, op1, op2);
 925                  });
 926              }
 927              else
 928              {
 929                  EmitVectorPairwiseOpF(context, (op1, op2) =>
 930                  {
 931                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2);
 932                  });
 933              }
 934          }
 935  
 936          public static void Fmaxv_V(ArmEmitterContext context)
 937          {
 938              if (Optimizations.UseAdvSimd)
 939              {
 940                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FmaxvV);
 941              }
 942              else if (Optimizations.FastFP && Optimizations.UseSse41)
 943              {
 944                  EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
 945                  {
 946                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 947                      {
 948                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true);
 949                      }, scalar: false, op1, op2);
 950                  });
 951              }
 952              else
 953              {
 954                  EmitVectorAcrossVectorOpF(context, (op1, op2) =>
 955                  {
 956                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMax), op1, op2);
 957                  });
 958              }
 959          }
 960  
 961          public static void Fmin_S(ArmEmitterContext context)
 962          {
 963              if (Optimizations.UseAdvSimd)
 964              {
 965                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FminS);
 966              }
 967              else if (Optimizations.FastFP && Optimizations.UseSse41)
 968              {
 969                  EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 970                  {
 971                      return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
 972                  }, scalar: true);
 973              }
 974              else
 975              {
 976                  EmitScalarBinaryOpF(context, (op1, op2) =>
 977                  {
 978                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2);
 979                  });
 980              }
 981          }
 982  
 983          public static void Fmin_V(ArmEmitterContext context)
 984          {
 985              if (Optimizations.UseAdvSimd)
 986              {
 987                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminV);
 988              }
 989              else if (Optimizations.FastFP && Optimizations.UseSse41)
 990              {
 991                  EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
 992                  {
 993                      return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
 994                  }, scalar: false);
 995              }
 996              else
 997              {
 998                  EmitVectorBinaryOpF(context, (op1, op2) =>
 999                  {
1000                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2);
1001                  });
1002              }
1003          }
1004  
1005          public static void Fminnm_S(ArmEmitterContext context)
1006          {
1007              if (Optimizations.UseAdvSimd)
1008              {
1009                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FminnmS);
1010              }
1011              else if (Optimizations.FastFP && Optimizations.UseSse41)
1012              {
1013                  EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true);
1014              }
1015              else
1016              {
1017                  EmitScalarBinaryOpF(context, (op1, op2) =>
1018                  {
1019                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
1020                  });
1021              }
1022          }
1023  
1024          public static void Fminnm_V(ArmEmitterContext context)
1025          {
1026              if (Optimizations.UseAdvSimd)
1027              {
1028                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminnmV);
1029              }
1030              else if (Optimizations.FastFP && Optimizations.UseSse41)
1031              {
1032                  EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false);
1033              }
1034              else
1035              {
1036                  EmitVectorBinaryOpF(context, (op1, op2) =>
1037                  {
1038                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
1039                  });
1040              }
1041          }
1042  
1043          public static void Fminnmp_S(ArmEmitterContext context)
1044          {
1045              if (Optimizations.UseAdvSimd)
1046              {
1047                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FminnmpS);
1048              }
1049              else if (Optimizations.FastFP && Optimizations.UseSse41)
1050              {
1051                  EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
1052                  {
1053                      return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true, op1, op2);
1054                  });
1055              }
1056              else
1057              {
1058                  EmitScalarPairwiseOpF(context, (op1, op2) =>
1059                  {
1060                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
1061                  });
1062              }
1063          }
1064  
1065          public static void Fminnmp_V(ArmEmitterContext context)
1066          {
1067              if (Optimizations.UseAdvSimd)
1068              {
1069                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminnmpV);
1070              }
1071              else if (Optimizations.FastFP && Optimizations.UseSse41)
1072              {
1073                  EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
1074                  {
1075                      return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2);
1076                  });
1077              }
1078              else
1079              {
1080                  EmitVectorPairwiseOpF(context, (op1, op2) =>
1081                  {
1082                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
1083                  });
1084              }
1085          }
1086  
1087          public static void Fminnmv_V(ArmEmitterContext context)
1088          {
1089              if (Optimizations.UseAdvSimd)
1090              {
1091                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FminnmvV);
1092              }
1093              else if (Optimizations.FastFP && Optimizations.UseSse41)
1094              {
1095                  EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
1096                  {
1097                      return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: false, op1, op2);
1098                  });
1099              }
1100              else
1101              {
1102                  EmitVectorAcrossVectorOpF(context, (op1, op2) =>
1103                  {
1104                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2);
1105                  });
1106              }
1107          }
1108  
1109          public static void Fminp_S(ArmEmitterContext context)
1110          {
1111              if (Optimizations.UseAdvSimd)
1112              {
1113                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FminpS);
1114              }
1115              else if (Optimizations.FastFP && Optimizations.UseSse41)
1116              {
1117                  EmitSse2ScalarPairwiseOpF(context, (op1, op2) =>
1118                  {
1119                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
1120                      {
1121                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
1122                      }, scalar: true, op1, op2);
1123                  });
1124              }
1125              else
1126              {
1127                  EmitScalarPairwiseOpF(context, (op1, op2) =>
1128                  {
1129                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2);
1130                  });
1131              }
1132          }
1133  
1134          public static void Fminp_V(ArmEmitterContext context)
1135          {
1136              if (Optimizations.UseAdvSimd)
1137              {
1138                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FminpV);
1139              }
1140              else if (Optimizations.FastFP && Optimizations.UseSse41)
1141              {
1142                  EmitSse2VectorPairwiseOpF(context, (op1, op2) =>
1143                  {
1144                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
1145                      {
1146                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
1147                      }, scalar: false, op1, op2);
1148                  });
1149              }
1150              else
1151              {
1152                  EmitVectorPairwiseOpF(context, (op1, op2) =>
1153                  {
1154                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2);
1155                  });
1156              }
1157          }
1158  
1159          public static void Fminv_V(ArmEmitterContext context)
1160          {
1161              if (Optimizations.UseAdvSimd)
1162              {
1163                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FminvV);
1164              }
1165              else if (Optimizations.FastFP && Optimizations.UseSse41)
1166              {
1167                  EmitSse2VectorAcrossVectorOpF(context, (op1, op2) =>
1168                  {
1169                      return EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
1170                      {
1171                          return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false);
1172                      }, scalar: false, op1, op2);
1173                  });
1174              }
1175              else
1176              {
1177                  EmitVectorAcrossVectorOpF(context, (op1, op2) =>
1178                  {
1179                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMin), op1, op2);
1180                  });
1181              }
1182          }
1183  
        public static void Fmla_Se(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLA (scalar, by element) directly.
                InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe);
            }
            else if (Optimizations.UseFma)
            {
                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Replicate element op.Index of m into all four 32-bit lanes.
                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));

                    // FMA231: res = d + (n * res), fused (single rounding).
                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res);

                    // Scalar single result: zero bits [127:32] of the destination.
                    context.Copy(d, context.VectorZeroUpper96(res));
                }
                else /* if (sizeF == 1) */
                {
                    // Replicate element op.Index of m into both 64-bit lanes.
                    int shuffleMask = op.Index | op.Index << 1;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));

                    // FMA231: res = d + (n * res), fused (single rounding).
                    res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res);

                    // Scalar double result: zero bits [127:64] of the destination.
                    context.Copy(d, context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Fallback: plain multiply-add on IR operations.
                // NOTE(review): this is not fused (two roundings), unlike the FMA path above.
                EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
                {
                    return context.Add(op1, context.Multiply(op2, op3));
                });
            }
        }
1229  
        public static void Fmla_V(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLA (vector) directly.
                InstEmitSimdHelperArm64.EmitVectorTernaryOpFRd(context, Intrinsic.Arm64FmlaV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    if (Optimizations.UseFma)
                    {
                        // FMA231: res = d + (n * m), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then add (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
                    }

                    // 64-bit vector form: the upper half of the register reads as zero.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(d, res);
                }
                else /* if (sizeF == 1) */
                {
                    if (Optimizations.UseFma)
                    {
                        // FMA231: res = d + (n * m), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then add (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
                    }

                    // Double-precision vectors always use the full 128-bit register; no upper zeroing.
                    context.Copy(d, res);
                }
            }
            else
            {
                // Software fallback: FPMulAdd implements the fused multiply-add semantics.
                EmitVectorTernaryOpF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
                });
            }
        }
1290  
        public static void Fmla_Ve(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLA (vector, by element) directly.
                InstEmitSimdHelperArm64.EmitVectorTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaVe);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Replicate element op.Index of m into all four 32-bit lanes.
                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));

                    if (Optimizations.UseFma)
                    {
                        // FMA231: res = d + (n * res), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then add (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
                        res = context.AddIntrinsic(Intrinsic.X86Addps, d, res);
                    }

                    // 64-bit vector form: the upper half of the register reads as zero.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(d, res);
                }
                else /* if (sizeF == 1) */
                {
                    // Replicate element op.Index of m into both 64-bit lanes.
                    int shuffleMask = op.Index | op.Index << 1;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));

                    if (Optimizations.UseFma)
                    {
                        // FMA231: res = d + (n * res), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then add (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
                        res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res);
                    }

                    // Double-precision vectors always use the full 128-bit register; no upper zeroing.
                    context.Copy(d, res);
                }
            }
            else
            {
                // Software fallback: FPMulAdd implements the fused multiply-add semantics.
                EmitVectorTernaryOpByElemF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), op1, op2, op3);
                });
            }
        }
1357  
        public static void Fmls_Se(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLS (scalar, by element) directly.
                InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe);
            }
            else if (Optimizations.UseFma)
            {
                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Replicate element op.Index of m into all four 32-bit lanes.
                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));

                    // FNMADD231: res = d - (n * res), fused (single rounding).
                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res);

                    // Scalar single result: zero bits [127:32] of the destination.
                    context.Copy(d, context.VectorZeroUpper96(res));
                }
                else /* if (sizeF == 1) */
                {
                    // Replicate element op.Index of m into both 64-bit lanes.
                    int shuffleMask = op.Index | op.Index << 1;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));

                    // FNMADD231: res = d - (n * res), fused (single rounding).
                    res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res);

                    // Scalar double result: zero bits [127:64] of the destination.
                    context.Copy(d, context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Fallback: plain multiply-subtract on IR operations.
                // NOTE(review): this is not fused (two roundings), unlike the FMA path above.
                EmitScalarTernaryOpByElemF(context, (op1, op2, op3) =>
                {
                    return context.Subtract(op1, context.Multiply(op2, op3));
                });
            }
        }
1403  
        public static void Fmls_V(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLS (vector) directly.
                InstEmitSimdHelperArm64.EmitVectorTernaryOpFRd(context, Intrinsic.Arm64FmlsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    if (Optimizations.UseFma)
                    {
                        // FNMADD231: res = d - (n * m), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then subtract (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
                    }

                    // 64-bit vector form: the upper half of the register reads as zero.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(d, res);
                }
                else /* if (sizeF == 1) */
                {
                    if (Optimizations.UseFma)
                    {
                        // FNMADD231: res = d - (n * m), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then subtract (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
                    }

                    // Double-precision vectors always use the full 128-bit register; no upper zeroing.
                    context.Copy(d, res);
                }
            }
            else
            {
                // Software fallback: FPMulSub implements the fused multiply-subtract semantics.
                EmitVectorTernaryOpF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3);
                });
            }
        }
1464  
        public static void Fmls_Ve(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                // Native AdvSimd: emit FMLS (vector, by element) directly.
                InstEmitSimdHelperArm64.EmitVectorTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsVe);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Replicate element op.Index of m into all four 32-bit lanes.
                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));

                    if (Optimizations.UseFma)
                    {
                        // FNMADD231: res = d - (n * res), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then subtract (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);
                        res = context.AddIntrinsic(Intrinsic.X86Subps, d, res);
                    }

                    // 64-bit vector form: the upper half of the register reads as zero.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(d, res);
                }
                else /* if (sizeF == 1) */
                {
                    // Replicate element op.Index of m into both 64-bit lanes.
                    int shuffleMask = op.Index | op.Index << 1;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));

                    if (Optimizations.UseFma)
                    {
                        // FNMADD231: res = d - (n * res), fused (single rounding per lane).
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res);
                    }
                    else
                    {
                        // Non-fused fallback: separate multiply then subtract (two roundings).
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);
                        res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res);
                    }

                    // Double-precision vectors always use the full 128-bit register; no upper zeroing.
                    context.Copy(d, res);
                }
            }
            else
            {
                // Software fallback: FPMulSub implements the fused multiply-subtract semantics.
                EmitVectorTernaryOpByElemF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3);
                });
            }
        }
1531  
        public static void Fmsub_S(ArmEmitterContext context) // Fused.
        {
            // FMSUB (scalar): d = a - (n * m), computed with a single rounding step.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FmsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand a = GetVec(op.Ra);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                Operand res;

                // op.Size == 0 is single precision, op.Size == 1 is double precision.
                if (op.Size == 0)
                {
                    if (Optimizations.UseFma)
                    {
                        // vfnmadd231ss computes a - (n * m) fused, matching ARM semantics.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m);
                    }
                    else
                    {
                        // Non-fused fallback: rounds twice (after mul and after sub).
                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subss, a, res);
                    }

                    // Scalar single result: clear the upper 96 bits of the destination.
                    context.Copy(d, context.VectorZeroUpper96(res));
                }
                else /* if (op.Size == 1) */
                {
                    if (Optimizations.UseFma)
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m);
                    }
                    else
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res);
                    }

                    // Scalar double result: clear the upper 64 bits of the destination.
                    context.Copy(d, context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Precise path: soft-float fused multiply-subtract helper.
                EmitScalarTernaryRaOpF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulSub), op1, op2, op3);
                });
            }
        }
1586  
1587          public static void Fmul_S(ArmEmitterContext context)
1588          {
1589              if (Optimizations.UseAdvSimd)
1590              {
1591                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmulS);
1592              }
1593              else if (Optimizations.FastFP && Optimizations.UseSse2)
1594              {
1595                  EmitScalarBinaryOpF(context, Intrinsic.X86Mulss, Intrinsic.X86Mulsd);
1596              }
1597              else if (Optimizations.FastFP)
1598              {
1599                  EmitScalarBinaryOpF(context, (op1, op2) => context.Multiply(op1, op2));
1600              }
1601              else
1602              {
1603                  EmitScalarBinaryOpF(context, (op1, op2) =>
1604                  {
1605                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2);
1606                  });
1607              }
1608          }
1609  
1610          public static void Fmul_Se(ArmEmitterContext context)
1611          {
1612              if (Optimizations.UseAdvSimd)
1613              {
1614                  InstEmitSimdHelperArm64.EmitScalarBinaryOpFByElem(context, Intrinsic.Arm64FmulSe);
1615              }
1616              else
1617              {
1618                  EmitScalarBinaryOpByElemF(context, (op1, op2) => context.Multiply(op1, op2));
1619              }
1620          }
1621  
1622          public static void Fmul_V(ArmEmitterContext context)
1623          {
1624              if (Optimizations.UseAdvSimd)
1625              {
1626                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmulV);
1627              }
1628              else if (Optimizations.FastFP && Optimizations.UseSse2)
1629              {
1630                  EmitVectorBinaryOpF(context, Intrinsic.X86Mulps, Intrinsic.X86Mulpd);
1631              }
1632              else if (Optimizations.FastFP)
1633              {
1634                  EmitVectorBinaryOpF(context, (op1, op2) => context.Multiply(op1, op2));
1635              }
1636              else
1637              {
1638                  EmitVectorBinaryOpF(context, (op1, op2) =>
1639                  {
1640                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2);
1641                  });
1642              }
1643          }
1644  
        public static void Fmul_Ve(ArmEmitterContext context)
        {
            // FMUL (vector, by element): multiply each element of Vn by a single
            // selected element of Vm.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOpFByElem(context, Intrinsic.Arm64FmulVe);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF == 0: single precision; sizeF == 1: double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Replicate the selected 32-bit element into all four shufps lanes
                    // (2 bits per lane).
                    int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask));

                    res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res);

                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(GetVec(op.Rd), res);
                }
                else /* if (sizeF == 1) */
                {
                    // Replicate the selected 64-bit element into both shufpd lanes
                    // (1 bit per lane).
                    int shuffleMask = op.Index | op.Index << 1;

                    Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask));

                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res);

                    context.Copy(GetVec(op.Rd), res);
                }
            }
            else if (Optimizations.FastFP)
            {
                EmitVectorBinaryOpByElemF(context, (op1, op2) => context.Multiply(op1, op2));
            }
            else
            {
                // Precise path: soft-float multiply per element.
                EmitVectorBinaryOpByElemF(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMul), op1, op2);
                });
            }
        }
1698  
1699          public static void Fmulx_S(ArmEmitterContext context)
1700          {
1701              if (Optimizations.UseAdvSimd)
1702              {
1703                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FmulxS);
1704              }
1705              else
1706              {
1707                  EmitScalarBinaryOpF(context, (op1, op2) =>
1708                  {
1709                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2);
1710                  });
1711              }
1712          }
1713  
1714          public static void Fmulx_Se(ArmEmitterContext context)
1715          {
1716              if (Optimizations.UseAdvSimd)
1717              {
1718                  InstEmitSimdHelperArm64.EmitScalarBinaryOpFByElem(context, Intrinsic.Arm64FmulxSe);
1719              }
1720              else
1721              {
1722                  EmitScalarBinaryOpByElemF(context, (op1, op2) =>
1723                  {
1724                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2);
1725                  });
1726              }
1727          }
1728  
1729          public static void Fmulx_V(ArmEmitterContext context)
1730          {
1731              if (Optimizations.UseAdvSimd)
1732              {
1733                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FmulxV);
1734              }
1735              else
1736              {
1737                  EmitVectorBinaryOpF(context, (op1, op2) =>
1738                  {
1739                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2);
1740                  });
1741              }
1742          }
1743  
1744          public static void Fmulx_Ve(ArmEmitterContext context)
1745          {
1746              if (Optimizations.UseAdvSimd)
1747              {
1748                  InstEmitSimdHelperArm64.EmitVectorBinaryOpFByElem(context, Intrinsic.Arm64FmulxVe);
1749              }
1750              else
1751              {
1752                  EmitVectorBinaryOpByElemF(context, (op1, op2) =>
1753                  {
1754                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulX), op1, op2);
1755                  });
1756              }
1757          }
1758  
        public static void Fneg_S(ArmEmitterContext context)
        {
            // FNEG (scalar): flip the sign bit of a floating-point value.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FnegS);
            }
            else if (Optimizations.UseSse2)
            {
                OpCodeSimd op = (OpCodeSimd)context.CurrOp;

                if (op.Size == 0)
                {
                    // -0f is a sign-bit-only mask; XOR toggles the sign.
                    Operand mask = X86GetScalar(context, -0f);

                    Operand res = context.AddIntrinsic(Intrinsic.X86Xorps, mask, GetVec(op.Rn));

                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
                }
                else /* if (op.Size == 1) */
                {
                    // -0d is a sign-bit-only mask; XOR toggles the sign.
                    Operand mask = X86GetScalar(context, -0d);

                    Operand res = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, GetVec(op.Rn));

                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
                }
            }
            else
            {
                EmitScalarUnaryOpF(context, (op1) => context.Negate(op1));
            }
        }
1791  
        public static void Fneg_V(ArmEmitterContext context)
        {
            // FNEG (vector): flip the sign bit of each floating-point element.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FnegV);
            }
            else if (Optimizations.UseSse2)
            {
                OpCodeSimd op = (OpCodeSimd)context.CurrOp;

                // sizeF == 0: single precision; sizeF == 1: double precision.
                int sizeF = op.Size & 1;

                if (sizeF == 0)
                {
                    // Broadcast the sign-bit mask to every lane; XOR toggles the signs.
                    Operand mask = X86GetAllElements(context, -0f);

                    Operand res = context.AddIntrinsic(Intrinsic.X86Xorps, mask, GetVec(op.Rn));

                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(GetVec(op.Rd), res);
                }
                else /* if (sizeF == 1) */
                {
                    Operand mask = X86GetAllElements(context, -0d);

                    Operand res = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, GetVec(op.Rn));

                    context.Copy(GetVec(op.Rd), res);
                }
            }
            else
            {
                EmitVectorUnaryOpF(context, (op1) => context.Negate(op1));
            }
        }
1831  
        public static void Fnmadd_S(ArmEmitterContext context) // Fused.
        {
            // FNMADD (scalar): d = -a - (n * m), computed with a single rounding step.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FnmaddS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand a = GetVec(op.Ra);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                Operand res;

                // op.Size == 0 is single precision, op.Size == 1 is double precision.
                if (op.Size == 0)
                {
                    if (Optimizations.UseFma)
                    {
                        // vfnmsub231ss computes -(n * m) - a fused, matching ARM semantics.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m);
                    }
                    else
                    {
                        // Negate a via sign-bit XOR, then compute (-a) - n * m (rounds twice).
                        Operand mask = X86GetScalar(context, -0f);
                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);

                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res);
                    }

                    context.Copy(d, context.VectorZeroUpper96(res));
                }
                else /* if (op.Size == 1) */
                {
                    if (Optimizations.UseFma)
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m);
                    }
                    else
                    {
                        Operand mask = X86GetScalar(context, -0d);
                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);

                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res);
                    }

                    context.Copy(d, context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Precise path: soft-float negated fused multiply-add helper.
                EmitScalarTernaryRaOpF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulAdd), op1, op2, op3);
                });
            }
        }
1892  
        public static void Fnmsub_S(ArmEmitterContext context) // Fused.
        {
            // FNMSUB (scalar): d = -a + (n * m), computed with a single rounding step.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarTernaryOpF(context, Intrinsic.Arm64FnmsubS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse2)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand d = GetVec(op.Rd);
                Operand a = GetVec(op.Ra);
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                Operand res;

                // op.Size == 0 is single precision, op.Size == 1 is double precision.
                if (op.Size == 0)
                {
                    if (Optimizations.UseFma)
                    {
                        // vfmsub231ss computes (n * m) - a fused, matching ARM semantics.
                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m);
                    }
                    else
                    {
                        // Negate a via sign-bit XOR, then compute (-a) + n * m (rounds twice).
                        Operand mask = X86GetScalar(context, -0f);
                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a);

                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res);
                    }

                    context.Copy(d, context.VectorZeroUpper96(res));
                }
                else /* if (op.Size == 1) */
                {
                    if (Optimizations.UseFma)
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m);
                    }
                    else
                    {
                        Operand mask = X86GetScalar(context, -0d);
                        Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a);

                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res);
                    }

                    context.Copy(d, context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Precise path: soft-float negated fused multiply-subtract helper.
                EmitScalarTernaryRaOpF(context, (op1, op2, op3) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPNegMulSub), op1, op2, op3);
                });
            }
        }
1953  
1954          public static void Fnmul_S(ArmEmitterContext context)
1955          {
1956              if (Optimizations.UseAdvSimd)
1957              {
1958                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FnmulS);
1959              }
1960              else
1961              {
1962                  EmitScalarBinaryOpF(context, (op1, op2) => context.Negate(context.Multiply(op1, op2)));
1963              }
1964          }
1965  
        public static void Frecpe_S(ArmEmitterContext context)
        {
            // FRECPE (scalar): reciprocal estimate.
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            // sizeF == 0: single precision; sizeF == 1: double precision.
            int sizeF = op.Size & 1;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrecpeS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
            {
                // rcpss precision differs from ARM's estimate; the round/exp8 helper
                // adjusts the result. Single precision only.
                Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rcpss, GetVec(op.Rn)), scalar: true);

                context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
            }
            else
            {
                // Precise path (and the only double-precision path on x86).
                EmitScalarUnaryOpF(context, (op1) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate), op1);
                });
            }
        }
1990  
        public static void Frecpe_V(ArmEmitterContext context)
        {
            // FRECPE (vector): reciprocal estimate per element.
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            // sizeF == 0: single precision; sizeF == 1: double precision.
            int sizeF = op.Size & 1;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrecpeV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
            {
                // rcpps precision differs from ARM's estimate; the round/exp8 helper
                // adjusts the result. Single precision only.
                Operand res = EmitSse41Round32Exp8OpF(context, context.AddIntrinsic(Intrinsic.X86Rcpps, GetVec(op.Rn)), scalar: false);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Precise path (and the only double-precision path on x86).
                EmitVectorUnaryOpF(context, (op1) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipEstimate), op1);
                });
            }
        }
2020  
        public static void Frecps_S(ArmEmitterContext context) // Fused.
        {
            // FRECPS (scalar): Newton-Raphson reciprocal step, d = 2 - (n * m), fused.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrecpsS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF == 0: single precision; sizeF == 1: double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    // Constant 2.0f, the minuend of the reciprocal step.
                    Operand mask = X86GetScalar(context, 2f);

                    if (Optimizations.UseFma)
                    {
                        // vfnmadd231ss computes 2 - (n * m) fused.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m);
                    }
                    else
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res);
                    }

                    // Fix up ARM special cases (e.g. 0 * inf) that SSE handles differently.
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);

                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
                }
                else /* if (sizeF == 1) */
                {
                    Operand mask = X86GetScalar(context, 2d);

                    if (Optimizations.UseFma)
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m);
                    }
                    else
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res);
                    }

                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF);

                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Precise path: soft-float fused reciprocal step.
                EmitScalarBinaryOpF(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused), op1, op2);
                });
            }
        }
2083  
        public static void Frecps_V(ArmEmitterContext context) // Fused.
        {
            // FRECPS (vector): Newton-Raphson reciprocal step per element,
            // d = 2 - (n * m), fused.
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FrecpsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF == 0: single precision; sizeF == 1: double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    // Constant 2.0f broadcast to every lane.
                    Operand mask = X86GetAllElements(context, 2f);

                    if (Optimizations.UseFma)
                    {
                        // vfnmadd231ps computes 2 - (n * m) fused, per lane.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m);
                    }
                    else
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res);
                    }

                    // Fix up ARM special cases (e.g. 0 * inf) that SSE handles differently.
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);

                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(GetVec(op.Rd), res);
                }
                else /* if (sizeF == 1) */
                {
                    Operand mask = X86GetAllElements(context, 2d);

                    if (Optimizations.UseFma)
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m);
                    }
                    else
                    {
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res);
                    }

                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF);

                    context.Copy(GetVec(op.Rd), res);
                }
            }
            else
            {
                // Precise path: soft-float fused reciprocal step.
                EmitVectorBinaryOpF(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecipStepFused), op1, op2);
                });
            }
        }
2151  
2152          public static void Frecpx_S(ArmEmitterContext context)
2153          {
2154              if (Optimizations.UseAdvSimd)
2155              {
2156                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrecpxS);
2157              }
2158              else
2159              {
2160                  EmitScalarUnaryOpF(context, (op1) =>
2161                  {
2162                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRecpX), op1);
2163                  });
2164              }
2165          }
2166  
2167          public static void Frinta_S(ArmEmitterContext context)
2168          {
2169              if (Optimizations.UseAdvSimd)
2170              {
2171                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintaS);
2172              }
2173              else if (Optimizations.UseSse41)
2174              {
2175                  EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearestAway);
2176              }
2177              else
2178              {
2179                  EmitScalarUnaryOpF(context, (op1) =>
2180                  {
2181                      return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
2182                  });
2183              }
2184          }
2185  
2186          public static void Frinta_V(ArmEmitterContext context)
2187          {
2188              if (Optimizations.UseAdvSimd)
2189              {
2190                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintaV);
2191              }
2192              else if (Optimizations.UseSse41)
2193              {
2194                  EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearestAway);
2195              }
2196              else
2197              {
2198                  EmitVectorUnaryOpF(context, (op1) =>
2199                  {
2200                      return EmitRoundMathCall(context, MidpointRounding.AwayFromZero, op1);
2201                  });
2202              }
2203          }
2204  
2205          public static void Frinti_S(ArmEmitterContext context)
2206          {
2207              if (Optimizations.UseAdvSimd)
2208              {
2209                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintiS);
2210              }
2211              else
2212              {
2213                  EmitScalarUnaryOpF(context, (op1) =>
2214                  {
2215                      return EmitRoundByRMode(context, op1);
2216                  });
2217              }
2218          }
2219  
2220          public static void Frinti_V(ArmEmitterContext context)
2221          {
2222              if (Optimizations.UseAdvSimd)
2223              {
2224                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintiV);
2225              }
2226              else
2227              {
2228                  EmitVectorUnaryOpF(context, (op1) =>
2229                  {
2230                      return EmitRoundByRMode(context, op1);
2231                  });
2232              }
2233          }
2234  
2235          public static void Frintm_S(ArmEmitterContext context)
2236          {
2237              if (Optimizations.UseAdvSimd)
2238              {
2239                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintmS);
2240              }
2241              else if (Optimizations.UseSse41)
2242              {
2243                  EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
2244              }
2245              else
2246              {
2247                  EmitScalarUnaryOpF(context, (op1) =>
2248                  {
2249                      return EmitUnaryMathCall(context, nameof(Math.Floor), op1);
2250                  });
2251              }
2252          }
2253  
2254          public static void Frintm_V(ArmEmitterContext context)
2255          {
2256              if (Optimizations.UseAdvSimd)
2257              {
2258                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintmV);
2259              }
2260              else if (Optimizations.UseSse41)
2261              {
2262                  EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsMinusInfinity);
2263              }
2264              else
2265              {
2266                  EmitVectorUnaryOpF(context, (op1) =>
2267                  {
2268                      return EmitUnaryMathCall(context, nameof(Math.Floor), op1);
2269                  });
2270              }
2271          }
2272  
2273          public static void Frintn_S(ArmEmitterContext context)
2274          {
2275              if (Optimizations.UseAdvSimd)
2276              {
2277                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintnS);
2278              }
2279              else if (Optimizations.UseSse41)
2280              {
2281                  EmitSse41ScalarRoundOpF(context, FPRoundingMode.ToNearest);
2282              }
2283              else
2284              {
2285                  EmitScalarUnaryOpF(context, (op1) =>
2286                  {
2287                      return EmitRoundMathCall(context, MidpointRounding.ToEven, op1);
2288                  });
2289              }
2290          }
2291  
2292          public static void Frintn_V(ArmEmitterContext context)
2293          {
2294              if (Optimizations.UseAdvSimd)
2295              {
2296                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintnV);
2297              }
2298              else if (Optimizations.UseSse41)
2299              {
2300                  EmitSse41VectorRoundOpF(context, FPRoundingMode.ToNearest);
2301              }
2302              else
2303              {
2304                  EmitVectorUnaryOpF(context, (op1) =>
2305                  {
2306                      return EmitRoundMathCall(context, MidpointRounding.ToEven, op1);
2307                  });
2308              }
2309          }
2310  
2311          public static void Frintp_S(ArmEmitterContext context)
2312          {
2313              if (Optimizations.UseAdvSimd)
2314              {
2315                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintpS);
2316              }
2317              else if (Optimizations.UseSse41)
2318              {
2319                  EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
2320              }
2321              else
2322              {
2323                  EmitScalarUnaryOpF(context, (op1) =>
2324                  {
2325                      return EmitUnaryMathCall(context, nameof(Math.Ceiling), op1);
2326                  });
2327              }
2328          }
2329  
2330          public static void Frintp_V(ArmEmitterContext context)
2331          {
2332              if (Optimizations.UseAdvSimd)
2333              {
2334                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintpV);
2335              }
2336              else if (Optimizations.UseSse41)
2337              {
2338                  EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsPlusInfinity);
2339              }
2340              else
2341              {
2342                  EmitVectorUnaryOpF(context, (op1) =>
2343                  {
2344                      return EmitUnaryMathCall(context, nameof(Math.Ceiling), op1);
2345                  });
2346              }
2347          }
2348  
2349          public static void Frintx_S(ArmEmitterContext context)
2350          {
2351              if (Optimizations.UseAdvSimd)
2352              {
2353                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS);
2354              }
2355              else
2356              {
2357                  EmitScalarUnaryOpF(context, (op1) =>
2358                  {
2359                      return EmitRoundByRMode(context, op1);
2360                  });
2361              }
2362          }
2363  
2364          public static void Frintx_V(ArmEmitterContext context)
2365          {
2366              if (Optimizations.UseAdvSimd)
2367              {
2368                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV);
2369              }
2370              else
2371              {
2372                  EmitVectorUnaryOpF(context, (op1) =>
2373                  {
2374                      return EmitRoundByRMode(context, op1);
2375                  });
2376              }
2377          }
2378  
2379          public static void Frintz_S(ArmEmitterContext context)
2380          {
2381              if (Optimizations.UseAdvSimd)
2382              {
2383                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintzS);
2384              }
2385              else if (Optimizations.UseSse41)
2386              {
2387                  EmitSse41ScalarRoundOpF(context, FPRoundingMode.TowardsZero);
2388              }
2389              else
2390              {
2391                  EmitScalarUnaryOpF(context, (op1) =>
2392                  {
2393                      return EmitUnaryMathCall(context, nameof(Math.Truncate), op1);
2394                  });
2395              }
2396          }
2397  
2398          public static void Frintz_V(ArmEmitterContext context)
2399          {
2400              if (Optimizations.UseAdvSimd)
2401              {
2402                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintzV);
2403              }
2404              else if (Optimizations.UseSse41)
2405              {
2406                  EmitSse41VectorRoundOpF(context, FPRoundingMode.TowardsZero);
2407              }
2408              else
2409              {
2410                  EmitVectorUnaryOpF(context, (op1) =>
2411                  {
2412                      return EmitUnaryMathCall(context, nameof(Math.Truncate), op1);
2413                  });
2414              }
2415          }
2416  
        /// <summary>Emits FRSQRTE (scalar): floating-point reciprocal square root estimate.</summary>
        public static void Frsqrte_S(ArmEmitterContext context)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            // sizeF: 0 = single precision, 1 = double precision.
            int sizeF = op.Size & 1;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrsqrteS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
            {
                // RSQRTSS handles subnormals as zero, which differs from Arm, so we can't use it here.

                // Compute rcp(sqrt(x)) instead, then post-process the estimate
                // via EmitSse41Round32Exp8OpF (scalar lane only).
                Operand res = context.AddIntrinsic(Intrinsic.X86Sqrtss, GetVec(op.Rn));
                res = context.AddIntrinsic(Intrinsic.X86Rcpss, res);
                res = EmitSse41Round32Exp8OpF(context, res, scalar: true);

                // Scalar single result: clear the upper 96 bits of the destination.
                context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
            }
            else
            {
                // Soft-float fallback (also taken for double precision, which has no fast path here).
                EmitScalarUnaryOpF(context, (op1) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate), op1);
                });
            }
        }
2445  
        /// <summary>Emits FRSQRTE (vector): floating-point reciprocal square root estimate, per lane.</summary>
        public static void Frsqrte_V(ArmEmitterContext context)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            // sizeF: 0 = single precision, 1 = double precision.
            int sizeF = op.Size & 1;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrsqrteV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41 && sizeF == 0)
            {
                // RSQRTPS handles subnormals as zero, which differs from Arm, so we can't use it here.

                // Compute rcp(sqrt(x)) per lane instead, then post-process the
                // estimate via EmitSse41Round32Exp8OpF.
                Operand res = context.AddIntrinsic(Intrinsic.X86Sqrtps, GetVec(op.Rn));
                res = context.AddIntrinsic(Intrinsic.X86Rcpps, res);
                res = EmitSse41Round32Exp8OpF(context, res, scalar: false);

                // 64-bit vector form only writes the low half of the destination.
                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Soft-float fallback (also taken for double precision, which has no fast path here).
                EmitVectorUnaryOpF(context, (op1) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtEstimate), op1);
                });
            }
        }
2479  
        /// <summary>
        /// Emits FRSQRTS (scalar): fused reciprocal square root step, (3 - n * m) / 2.
        /// </summary>
        public static void Frsqrts_S(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FrsqrtsS);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    Operand maskHalf = X86GetScalar(context, 0.5f);
                    Operand maskThree = X86GetScalar(context, 3f);
                    Operand maskOneHalf = X86GetScalar(context, 1.5f);

                    if (Optimizations.UseFma)
                    {
                        // res = 3 - n * m in a single fused negative-multiply-add.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m);
                    }
                    else
                    {
                        // Non-FMA path: res = 3 - (n * m), with an intermediate rounding.
                        res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res);
                    }

                    // res = (3 - n * m) * 0.5; maskOneHalf (1.5) is handed to the
                    // select helper — presumably the special-case result, see
                    // EmitSse41RecipStepSelectOpF.
                    res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res);
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);

                    // Scalar single result: clear the upper 96 bits of the destination.
                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
                }
                else /* if (sizeF == 1) */
                {
                    Operand maskHalf = X86GetScalar(context, 0.5d);
                    Operand maskThree = X86GetScalar(context, 3d);
                    Operand maskOneHalf = X86GetScalar(context, 1.5d);

                    if (Optimizations.UseFma)
                    {
                        // res = 3 - n * m in a single fused negative-multiply-add.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m);
                    }
                    else
                    {
                        // Non-FMA path: res = 3 - (n * m), with an intermediate rounding.
                        res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res);
                    }

                    // res = (3 - n * m) * 0.5, then special-case selection as above.
                    res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res);
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF);

                    // Scalar double result: clear the upper 64 bits of the destination.
                    context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res));
                }
            }
            else
            {
                // Soft-float fallback.
                EmitScalarBinaryOpF(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStepFused), op1, op2);
                });
            }
        }
2548  
        /// <summary>
        /// Emits FRSQRTS (vector): fused reciprocal square root step, (3 - n * m) / 2, per lane.
        /// </summary>
        public static void Frsqrts_V(ArmEmitterContext context) // Fused.
        {
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FrsqrtsV);
            }
            else if (Optimizations.FastFP && Optimizations.UseSse41)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // sizeF: 0 = single precision, 1 = double precision.
                int sizeF = op.Size & 1;

                Operand res;

                if (sizeF == 0)
                {
                    Operand maskHalf = X86GetAllElements(context, 0.5f);
                    Operand maskThree = X86GetAllElements(context, 3f);
                    Operand maskOneHalf = X86GetAllElements(context, 1.5f);

                    if (Optimizations.UseFma)
                    {
                        // res = 3 - n * m in a single fused negative-multiply-add.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m);
                    }
                    else
                    {
                        // Non-FMA path: res = 3 - (n * m), with an intermediate rounding.
                        res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res);
                    }

                    // res = (3 - n * m) * 0.5; maskOneHalf (1.5) is handed to the
                    // select helper — presumably the special-case result, see
                    // EmitSse41RecipStepSelectOpF.
                    res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res);
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);

                    // 64-bit vector form only writes the low half of the destination.
                    if (op.RegisterSize == RegisterSize.Simd64)
                    {
                        res = context.VectorZeroUpper64(res);
                    }

                    context.Copy(GetVec(op.Rd), res);
                }
                else /* if (sizeF == 1) */
                {
                    Operand maskHalf = X86GetAllElements(context, 0.5d);
                    Operand maskThree = X86GetAllElements(context, 3d);
                    Operand maskOneHalf = X86GetAllElements(context, 1.5d);

                    if (Optimizations.UseFma)
                    {
                        // res = 3 - n * m in a single fused negative-multiply-add.
                        res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m);
                    }
                    else
                    {
                        // Non-FMA path: res = 3 - (n * m), with an intermediate rounding.
                        res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m);
                        res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res);
                    }

                    // res = (3 - n * m) * 0.5, then special-case selection as above.
                    res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res);
                    res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF);

                    context.Copy(GetVec(op.Rd), res);
                }
            }
            else
            {
                // Soft-float fallback.
                EmitVectorBinaryOpF(context, (op1, op2) =>
                {
                    return EmitSoftFloatCall(context, nameof(SoftFloat32.FPRSqrtStepFused), op1, op2);
                });
            }
        }
2622  
2623          public static void Fsqrt_S(ArmEmitterContext context)
2624          {
2625              if (Optimizations.UseAdvSimd)
2626              {
2627                  InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FsqrtS);
2628              }
2629              else if (Optimizations.FastFP && Optimizations.UseSse2)
2630              {
2631                  EmitScalarUnaryOpF(context, Intrinsic.X86Sqrtss, Intrinsic.X86Sqrtsd);
2632              }
2633              else
2634              {
2635                  EmitScalarUnaryOpF(context, (op1) =>
2636                  {
2637                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1);
2638                  });
2639              }
2640          }
2641  
2642          public static void Fsqrt_V(ArmEmitterContext context)
2643          {
2644              if (Optimizations.UseAdvSimd)
2645              {
2646                  InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FsqrtV);
2647              }
2648              else if (Optimizations.FastFP && Optimizations.UseSse2)
2649              {
2650                  EmitVectorUnaryOpF(context, Intrinsic.X86Sqrtps, Intrinsic.X86Sqrtpd);
2651              }
2652              else
2653              {
2654                  EmitVectorUnaryOpF(context, (op1) =>
2655                  {
2656                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSqrt), op1);
2657                  });
2658              }
2659          }
2660  
2661          public static void Fsub_S(ArmEmitterContext context)
2662          {
2663              if (Optimizations.UseAdvSimd)
2664              {
2665                  InstEmitSimdHelperArm64.EmitScalarBinaryOpF(context, Intrinsic.Arm64FsubS);
2666              }
2667              else if (Optimizations.FastFP && Optimizations.UseSse2)
2668              {
2669                  EmitScalarBinaryOpF(context, Intrinsic.X86Subss, Intrinsic.X86Subsd);
2670              }
2671              else if (Optimizations.FastFP)
2672              {
2673                  EmitScalarBinaryOpF(context, (op1, op2) => context.Subtract(op1, op2));
2674              }
2675              else
2676              {
2677                  EmitScalarBinaryOpF(context, (op1, op2) =>
2678                  {
2679                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2);
2680                  });
2681              }
2682          }
2683  
2684          public static void Fsub_V(ArmEmitterContext context)
2685          {
2686              if (Optimizations.UseAdvSimd)
2687              {
2688                  InstEmitSimdHelperArm64.EmitVectorBinaryOpF(context, Intrinsic.Arm64FsubV);
2689              }
2690              else if (Optimizations.FastFP && Optimizations.UseSse2)
2691              {
2692                  EmitVectorBinaryOpF(context, Intrinsic.X86Subps, Intrinsic.X86Subpd);
2693              }
2694              else if (Optimizations.FastFP)
2695              {
2696                  EmitVectorBinaryOpF(context, (op1, op2) => context.Subtract(op1, op2));
2697              }
2698              else
2699              {
2700                  EmitVectorBinaryOpF(context, (op1, op2) =>
2701                  {
2702                      return EmitSoftFloatCall(context, nameof(SoftFloat32.FPSub), op1, op2);
2703                  });
2704              }
2705          }
2706  
2707          public static void Mla_V(ArmEmitterContext context)
2708          {
2709              if (Optimizations.UseAdvSimd)
2710              {
2711                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64MlaV);
2712              }
2713              else if (Optimizations.UseSse41)
2714              {
2715                  EmitSse41VectorMul_AddSub(context, AddSub.Add);
2716              }
2717              else
2718              {
2719                  EmitVectorTernaryOpZx(context, (op1, op2, op3) =>
2720                  {
2721                      return context.Add(op1, context.Multiply(op2, op3));
2722                  });
2723              }
2724          }
2725  
2726          public static void Mla_Ve(ArmEmitterContext context)
2727          {
2728              if (Optimizations.UseAdvSimd)
2729              {
2730                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64MlaVe);
2731              }
2732              else
2733              {
2734                  EmitVectorTernaryOpByElemZx(context, (op1, op2, op3) =>
2735                  {
2736                      return context.Add(op1, context.Multiply(op2, op3));
2737                  });
2738              }
2739          }
2740  
2741          public static void Mls_V(ArmEmitterContext context)
2742          {
2743              if (Optimizations.UseAdvSimd)
2744              {
2745                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64MlsV);
2746              }
2747              else if (Optimizations.UseSse41)
2748              {
2749                  EmitSse41VectorMul_AddSub(context, AddSub.Subtract);
2750              }
2751              else
2752              {
2753                  EmitVectorTernaryOpZx(context, (op1, op2, op3) =>
2754                  {
2755                      return context.Subtract(op1, context.Multiply(op2, op3));
2756                  });
2757              }
2758          }
2759  
2760          public static void Mls_Ve(ArmEmitterContext context)
2761          {
2762              if (Optimizations.UseAdvSimd)
2763              {
2764                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64MlsVe);
2765              }
2766              else
2767              {
2768                  EmitVectorTernaryOpByElemZx(context, (op1, op2, op3) =>
2769                  {
2770                      return context.Subtract(op1, context.Multiply(op2, op3));
2771                  });
2772              }
2773          }
2774  
2775          public static void Mul_V(ArmEmitterContext context)
2776          {
2777              if (Optimizations.UseAdvSimd)
2778              {
2779                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64MulV);
2780              }
2781              else if (Optimizations.UseSse41)
2782              {
2783                  EmitSse41VectorMul_AddSub(context, AddSub.None);
2784              }
2785              else
2786              {
2787                  EmitVectorBinaryOpZx(context, (op1, op2) => context.Multiply(op1, op2));
2788              }
2789          }
2790  
2791          public static void Mul_Ve(ArmEmitterContext context)
2792          {
2793              if (Optimizations.UseAdvSimd)
2794              {
2795                  InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64MulVe);
2796              }
2797              else
2798              {
2799                  EmitVectorBinaryOpByElemZx(context, (op1, op2) => context.Multiply(op1, op2));
2800              }
2801          }
2802  
2803          public static void Neg_S(ArmEmitterContext context)
2804          {
2805              if (Optimizations.UseAdvSimd)
2806              {
2807                  InstEmitSimdHelperArm64.EmitScalarUnaryOp(context, Intrinsic.Arm64NegS);
2808              }
2809              else
2810              {
2811                  EmitScalarUnaryOpSx(context, (op1) => context.Negate(op1));
2812              }
2813          }
2814  
2815          public static void Neg_V(ArmEmitterContext context)
2816          {
2817              if (Optimizations.UseAdvSimd)
2818              {
2819                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64NegV);
2820              }
2821              else if (Optimizations.UseSse2)
2822              {
2823                  OpCodeSimd op = (OpCodeSimd)context.CurrOp;
2824  
2825                  Intrinsic subInst = X86PsubInstruction[op.Size];
2826  
2827                  Operand res = context.AddIntrinsic(subInst, context.VectorZero(), GetVec(op.Rn));
2828  
2829                  if (op.RegisterSize == RegisterSize.Simd64)
2830                  {
2831                      res = context.VectorZeroUpper64(res);
2832                  }
2833  
2834                  context.Copy(GetVec(op.Rd), res);
2835              }
2836              else
2837              {
2838                  EmitVectorUnaryOpSx(context, (op1) => context.Negate(op1));
2839              }
2840          }
2841  
        /// <summary>
        /// Emits PMULL/PMULL2 (vector): polynomial (carry-less) multiply long.
        /// Supported sizes here are 8x8 to 16 bits (Size == 0) and 64x64 to 128 bits (Size == 3).
        /// </summary>
        public static void Pmull_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseArm64Pmull)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64PmullV);
            }
            else if (Optimizations.UsePclmulqdq && op.Size == 3)
            {
                // 64x64 -> 128 carry-less multiply maps directly onto PCLMULQDQ.
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // imm8 selects the 64-bit halves to multiply: low halves for PMULL
                // (Simd64), high halves for PMULL2 (Simd128).
                int imm8 = op.RegisterSize == RegisterSize.Simd64 ? 0b0000_0000 : 0b0001_0001;

                Operand res = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, n, m, Const(imm8));

                context.Copy(GetVec(op.Rd), res);
            }
            else if (Optimizations.UseSse41)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    // PMULL: sources are the low 64 bits of each register.
                    n = context.VectorZeroUpper64(n);
                    m = context.VectorZeroUpper64(m);
                }
                else /* if (op.RegisterSize == RegisterSize.Simd128) */
                {
                    // PMULL2: shift the high halves down into the low 64 bits.
                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
                }

                Operand res = context.VectorZero();

                if (op.Size == 0)
                {
                    // 8x8 -> 16 bit carry-less multiply emulated bit by bit in widened
                    // 16-bit lanes: XOR in (m << i) wherever bit i of n is set.
                    n = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, n);
                    m = context.AddIntrinsic(Intrinsic.X86Pmovzxbw, m);

                    for (int i = 0; i < 8; i++)
                    {
                        // Replicate bit i of each lane of n across the whole lane
                        // (shift it to the sign bit, then arithmetic-shift back).
                        Operand mask = context.AddIntrinsic(Intrinsic.X86Psllw, n, Const(15 - i));
                        mask = context.AddIntrinsic(Intrinsic.X86Psraw, mask, Const(15));

                        Operand tmp = context.AddIntrinsic(Intrinsic.X86Psllw, m, Const(i));
                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
                    }
                }
                else /* if (op.Size == 3) */
                {
                    // 64x64 -> 128 bit carry-less multiply without PCLMULQDQ,
                    // one bit of n at a time across the full 128-bit accumulator.
                    Operand zero = context.VectorZero();

                    for (int i = 0; i < 64; i++)
                    {
                        // Build a 128-bit all-ones/all-zeros mask from bit i of n:
                        // duplicate the low qword, isolate the bit, then 0 - bit.
                        Operand mask = context.AddIntrinsic(Intrinsic.X86Movlhps, n, n);
                        mask = context.AddIntrinsic(Intrinsic.X86Psllq, mask, Const(63 - i));
                        mask = context.AddIntrinsic(Intrinsic.X86Psrlq, mask, Const(63));
                        mask = context.AddIntrinsic(Intrinsic.X86Psubq, zero, mask);

                        // (m << i) as a full 128-bit shift, masked and XORed in.
                        Operand tmp = EmitSse2Sll_128(context, m, i);
                        tmp = context.AddIntrinsic(Intrinsic.X86Pand, tmp, mask);

                        res = context.AddIntrinsic(Intrinsic.X86Pxor, res, tmp);
                    }
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Software fallback: per-element polynomial multiply.
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                Operand res;

                if (op.Size == 0)
                {
                    res = context.VectorZero();

                    // PMULL2 reads the upper 8 bytes of each source.
                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 8;

                    for (int index = 0; index < 8; index++)
                    {
                        Operand ne = context.VectorExtract8(n, part + index);
                        Operand me = context.VectorExtract8(m, part + index);

                        Operand de = EmitPolynomialMultiply(context, ne, me, 8);

                        res = EmitVectorInsert(context, res, de, index, 1);
                    }
                }
                else /* if (op.Size == 3) */
                {
                    // PMULL2 reads the upper 64-bit element of each source.
                    int part = op.RegisterSize == RegisterSize.Simd64 ? 0 : 1;

                    Operand ne = context.VectorExtract(OperandType.I64, n, part);
                    Operand me = context.VectorExtract(OperandType.I64, m, part);

                    res = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.PolynomialMult64_128)), ne, me);
                }

                context.Copy(GetVec(op.Rd), res);
            }
        }
2951  
2952          public static void Raddhn_V(ArmEmitterContext context)
2953          {
2954              if (Optimizations.UseAdvSimd)
2955              {
2956                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64RaddhnV);
2957              }
2958              else
2959              {
2960                  EmitHighNarrow(context, (op1, op2) => context.Add(op1, op2), round: true);
2961              }
2962          }
2963  
2964          public static void Rsubhn_V(ArmEmitterContext context)
2965          {
2966              if (Optimizations.UseAdvSimd)
2967              {
2968                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64RsubhnV);
2969              }
2970              else
2971              {
2972                  EmitHighNarrow(context, (op1, op2) => context.Subtract(op1, op2), round: true);
2973              }
2974          }
2975  
2976          public static void Saba_V(ArmEmitterContext context)
2977          {
2978              if (Optimizations.UseAdvSimd)
2979              {
2980                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SabaV);
2981              }
2982              else
2983              {
2984                  EmitVectorTernaryOpSx(context, (op1, op2, op3) =>
2985                  {
2986                      return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3)));
2987                  });
2988              }
2989          }
2990  
2991          public static void Sabal_V(ArmEmitterContext context)
2992          {
2993              if (Optimizations.UseAdvSimd)
2994              {
2995                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SabalV);
2996              }
2997              else
2998              {
2999                  EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) =>
3000                  {
3001                      return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3)));
3002                  });
3003              }
3004          }
3005  
3006          public static void Sabd_V(ArmEmitterContext context)
3007          {
3008              if (Optimizations.UseAdvSimd)
3009              {
3010                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SabdV);
3011              }
3012              else if (Optimizations.UseSse41)
3013              {
3014                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3015  
3016                  Operand n = GetVec(op.Rn);
3017                  Operand m = GetVec(op.Rm);
3018  
3019                  EmitSse41VectorSabdOp(context, op, n, m, isLong: false);
3020              }
3021              else
3022              {
3023                  EmitVectorBinaryOpSx(context, (op1, op2) =>
3024                  {
3025                      return EmitAbs(context, context.Subtract(op1, op2));
3026                  });
3027              }
3028          }
3029  
        /// <summary>Emits SABDL/SABDL2 (vector): signed absolute difference long (widening).</summary>
        public static void Sabdl_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SabdlV);
            }
            else if (Optimizations.UseSse41 && op.Size < 2)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // SABDL2 variant: operate on the upper halves of the sources.
                if (op.RegisterSize == RegisterSize.Simd128)
                {
                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
                }

                // Sign-extend each element to double width before the abs-diff
                // (only Size 0 and 1 reach here, per the guard above).
                Intrinsic movInst = op.Size == 0
                    ? Intrinsic.X86Pmovsxbw
                    : Intrinsic.X86Pmovsxwd;

                n = context.AddIntrinsic(movInst, n);
                m = context.AddIntrinsic(movInst, m);

                EmitSse41VectorSabdOp(context, op, n, m, isLong: true);
            }
            else
            {
                // Fallback: widen Rn/Rm, then |a - b| per element.
                EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) =>
                {
                    return EmitAbs(context, context.Subtract(op1, op2));
                });
            }
        }
3066  
3067          public static void Sadalp_V(ArmEmitterContext context)
3068          {
3069              if (Optimizations.UseAdvSimd)
3070              {
3071                  InstEmitSimdHelperArm64.EmitVectorBinaryOpRd(context, Intrinsic.Arm64SadalpV);
3072              }
3073              else
3074              {
3075                  EmitAddLongPairwise(context, signed: true, accumulate: true);
3076              }
3077          }
3078  
        /// <summary>Emits SADDL/SADDL2 (vector): signed add long (widening).</summary>
        public static void Saddl_V(ArmEmitterContext context)
        {
            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SaddlV);
            }
            else if (Optimizations.UseSse41)
            {
                OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // SADDL2 variant: operate on the upper halves of the sources.
                if (op.RegisterSize == RegisterSize.Simd128)
                {
                    n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
                    m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
                }

                // Sign-extend each element to double width...
                Intrinsic movInst = X86PmovsxInstruction[op.Size];

                n = context.AddIntrinsic(movInst, n);
                m = context.AddIntrinsic(movInst, m);

                // ...then add at the widened element size (op.Size + 1).
                Intrinsic addInst = X86PaddInstruction[op.Size + 1];

                context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m));
            }
            else
            {
                // Fallback: widen Rn/Rm, then add per element.
                EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Add(op1, op2));
            }
        }
3112  
3113          public static void Saddlp_V(ArmEmitterContext context)
3114          {
3115              if (Optimizations.UseAdvSimd)
3116              {
3117                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SaddlpV);
3118              }
3119              else
3120              {
3121                  EmitAddLongPairwise(context, signed: true, accumulate: false);
3122              }
3123          }
3124  
3125          public static void Saddlv_V(ArmEmitterContext context)
3126          {
3127              if (Optimizations.UseAdvSimd)
3128              {
3129                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SaddlvV);
3130              }
3131              else
3132              {
3133                  EmitVectorLongAcrossVectorOpSx(context, (op1, op2) => context.Add(op1, op2));
3134              }
3135          }
3136  
3137          public static void Saddw_V(ArmEmitterContext context)
3138          {
3139              if (Optimizations.UseAdvSimd)
3140              {
3141                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SaddwV);
3142              }
3143              else if (Optimizations.UseSse41)
3144              {
3145                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3146  
3147                  Operand n = GetVec(op.Rn);
3148                  Operand m = GetVec(op.Rm);
3149  
3150                  if (op.RegisterSize == RegisterSize.Simd128)
3151                  {
3152                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3153                  }
3154  
3155                  Intrinsic movInst = X86PmovsxInstruction[op.Size];
3156  
3157                  m = context.AddIntrinsic(movInst, m);
3158  
3159                  Intrinsic addInst = X86PaddInstruction[op.Size + 1];
3160  
3161                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m));
3162              }
3163              else
3164              {
3165                  EmitVectorWidenRmBinaryOpSx(context, (op1, op2) => context.Add(op1, op2));
3166              }
3167          }
3168  
3169          public static void Shadd_V(ArmEmitterContext context)
3170          {
3171              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3172  
3173              if (Optimizations.UseAdvSimd)
3174              {
3175                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64ShaddV);
3176              }
3177              else if (Optimizations.UseSse2 && op.Size > 0)
3178              {
3179                  Operand n = GetVec(op.Rn);
3180                  Operand m = GetVec(op.Rm);
3181  
3182                  Operand res = context.AddIntrinsic(Intrinsic.X86Pand, n, m);
3183                  Operand res2 = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);
3184  
3185                  Intrinsic shiftInst = op.Size == 1 ? Intrinsic.X86Psraw : Intrinsic.X86Psrad;
3186  
3187                  res2 = context.AddIntrinsic(shiftInst, res2, Const(1));
3188  
3189                  Intrinsic addInst = X86PaddInstruction[op.Size];
3190  
3191                  res = context.AddIntrinsic(addInst, res, res2);
3192  
3193                  if (op.RegisterSize == RegisterSize.Simd64)
3194                  {
3195                      res = context.VectorZeroUpper64(res);
3196                  }
3197  
3198                  context.Copy(GetVec(op.Rd), res);
3199              }
3200              else
3201              {
3202                  EmitVectorBinaryOpSx(context, (op1, op2) =>
3203                  {
3204                      return context.ShiftRightSI(context.Add(op1, op2), Const(1));
3205                  });
3206              }
3207          }
3208  
        // Signed halving subtract: per element, (n - m) >> 1 (arithmetic shift).
        public static void Shsub_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64ShsubV);
            }
            else if (Optimizations.UseSse2 && op.Size < 2)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // Per-element sign-bit bias (0x80 / 0x8000): adding it maps the signed
                // inputs into the unsigned domain so PAVGB/PAVGW can be used.
                Operand mask = X86GetAllElements(context, (int)(op.Size == 0 ? 0x80808080u : 0x80008000u));

                Intrinsic addInst = X86PaddInstruction[op.Size];

                Operand nPlusMask = context.AddIntrinsic(addInst, n, mask);
                Operand mPlusMask = context.AddIntrinsic(addInst, m, mask);

                // PAVG computes (a + b + 1) >> 1 on unsigned elements.
                Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw;

                Operand res = context.AddIntrinsic(avgInst, nPlusMask, mPlusMask);

                Intrinsic subInst = X86PsubInstruction[op.Size];

                // Uses the identity a - ((a + b + 1) >> 1) == (a - b) >> 1; the bias
                // cancels out in the subtraction, so no un-bias step is needed.
                res = context.AddIntrinsic(subInst, nPlusMask, res);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Generic fallback: widen per element, subtract, then arithmetic shift right by one.
                EmitVectorBinaryOpSx(context, (op1, op2) =>
                {
                    return context.ShiftRightSI(context.Subtract(op1, op2), Const(1));
                });
            }
        }
3252  
3253          public static void Smax_V(ArmEmitterContext context)
3254          {
3255              if (Optimizations.UseAdvSimd)
3256              {
3257                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmaxV);
3258              }
3259              else if (Optimizations.UseSse41)
3260              {
3261                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3262  
3263                  Operand n = GetVec(op.Rn);
3264                  Operand m = GetVec(op.Rm);
3265  
3266                  Intrinsic maxInst = X86PmaxsInstruction[op.Size];
3267  
3268                  Operand res = context.AddIntrinsic(maxInst, n, m);
3269  
3270                  if (op.RegisterSize == RegisterSize.Simd64)
3271                  {
3272                      res = context.VectorZeroUpper64(res);
3273                  }
3274  
3275                  context.Copy(GetVec(op.Rd), res);
3276              }
3277              else
3278              {
3279                  EmitVectorBinaryOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true));
3280              }
3281          }
3282  
3283          public static void Smaxp_V(ArmEmitterContext context)
3284          {
3285              if (Optimizations.UseAdvSimd)
3286              {
3287                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmaxpV);
3288              }
3289              else if (Optimizations.UseSsse3)
3290              {
3291                  EmitSsse3VectorPairwiseOp(context, X86PmaxsInstruction);
3292              }
3293              else
3294              {
3295                  EmitVectorPairwiseOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true));
3296              }
3297          }
3298  
3299          public static void Smaxv_V(ArmEmitterContext context)
3300          {
3301              if (Optimizations.UseAdvSimd)
3302              {
3303                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SmaxvV);
3304              }
3305              else
3306              {
3307                  EmitVectorAcrossVectorOpSx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: true));
3308              }
3309          }
3310  
3311          public static void Smin_V(ArmEmitterContext context)
3312          {
3313              if (Optimizations.UseAdvSimd)
3314              {
3315                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SminV);
3316              }
3317              else if (Optimizations.UseSse41)
3318              {
3319                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3320  
3321                  Operand n = GetVec(op.Rn);
3322                  Operand m = GetVec(op.Rm);
3323  
3324                  Intrinsic minInst = X86PminsInstruction[op.Size];
3325  
3326                  Operand res = context.AddIntrinsic(minInst, n, m);
3327  
3328                  if (op.RegisterSize == RegisterSize.Simd64)
3329                  {
3330                      res = context.VectorZeroUpper64(res);
3331                  }
3332  
3333                  context.Copy(GetVec(op.Rd), res);
3334              }
3335              else
3336              {
3337                  EmitVectorBinaryOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true));
3338              }
3339          }
3340  
3341          public static void Sminp_V(ArmEmitterContext context)
3342          {
3343              if (Optimizations.UseAdvSimd)
3344              {
3345                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SminpV);
3346              }
3347              else if (Optimizations.UseSsse3)
3348              {
3349                  EmitSsse3VectorPairwiseOp(context, X86PminsInstruction);
3350              }
3351              else
3352              {
3353                  EmitVectorPairwiseOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true));
3354              }
3355          }
3356  
3357          public static void Sminv_V(ArmEmitterContext context)
3358          {
3359              if (Optimizations.UseAdvSimd)
3360              {
3361                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64SminvV);
3362              }
3363              else
3364              {
3365                  EmitVectorAcrossVectorOpSx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: true));
3366              }
3367          }
3368  
3369          public static void Smlal_V(ArmEmitterContext context)
3370          {
3371              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3372  
3373              if (Optimizations.UseAdvSimd)
3374              {
3375                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SmlalV);
3376              }
3377              else if (Optimizations.UseSse41 && op.Size < 2)
3378              {
3379                  Operand d = GetVec(op.Rd);
3380                  Operand n = GetVec(op.Rn);
3381                  Operand m = GetVec(op.Rm);
3382  
3383                  if (op.RegisterSize == RegisterSize.Simd128)
3384                  {
3385                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
3386                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3387                  }
3388  
3389                  Intrinsic movInst = X86PmovsxInstruction[op.Size];
3390  
3391                  n = context.AddIntrinsic(movInst, n);
3392                  m = context.AddIntrinsic(movInst, m);
3393  
3394                  Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld;
3395  
3396                  Operand res = context.AddIntrinsic(mullInst, n, m);
3397  
3398                  Intrinsic addInst = X86PaddInstruction[op.Size + 1];
3399  
3400                  context.Copy(d, context.AddIntrinsic(addInst, d, res));
3401              }
3402              else
3403              {
3404                  EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) =>
3405                  {
3406                      return context.Add(op1, context.Multiply(op2, op3));
3407                  });
3408              }
3409          }
3410  
3411          public static void Smlal_Ve(ArmEmitterContext context)
3412          {
3413              if (Optimizations.UseAdvSimd)
3414              {
3415                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64SmlalVe);
3416              }
3417              else
3418              {
3419                  EmitVectorWidenTernaryOpByElemSx(context, (op1, op2, op3) =>
3420                  {
3421                      return context.Add(op1, context.Multiply(op2, op3));
3422                  });
3423              }
3424          }
3425  
3426          public static void Smlsl_V(ArmEmitterContext context)
3427          {
3428              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3429  
3430              if (Optimizations.UseAdvSimd)
3431              {
3432                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SmlslV);
3433              }
3434              else if (Optimizations.UseSse41 && op.Size < 2)
3435              {
3436                  Operand d = GetVec(op.Rd);
3437                  Operand n = GetVec(op.Rn);
3438                  Operand m = GetVec(op.Rm);
3439  
3440                  if (op.RegisterSize == RegisterSize.Simd128)
3441                  {
3442                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
3443                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3444                  }
3445  
3446                  Intrinsic movInst = op.Size == 0 ? Intrinsic.X86Pmovsxbw : Intrinsic.X86Pmovsxwd;
3447  
3448                  n = context.AddIntrinsic(movInst, n);
3449                  m = context.AddIntrinsic(movInst, m);
3450  
3451                  Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld;
3452  
3453                  Operand res = context.AddIntrinsic(mullInst, n, m);
3454  
3455                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
3456  
3457                  context.Copy(d, context.AddIntrinsic(subInst, d, res));
3458              }
3459              else
3460              {
3461                  EmitVectorWidenRnRmTernaryOpSx(context, (op1, op2, op3) =>
3462                  {
3463                      return context.Subtract(op1, context.Multiply(op2, op3));
3464                  });
3465              }
3466          }
3467  
3468          public static void Smlsl_Ve(ArmEmitterContext context)
3469          {
3470              if (Optimizations.UseAdvSimd)
3471              {
3472                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64SmlslVe);
3473              }
3474              else
3475              {
3476                  EmitVectorWidenTernaryOpByElemSx(context, (op1, op2, op3) =>
3477                  {
3478                      return context.Subtract(op1, context.Multiply(op2, op3));
3479                  });
3480              }
3481          }
3482  
3483          public static void Smull_V(ArmEmitterContext context)
3484          {
3485              if (Optimizations.UseAdvSimd)
3486              {
3487                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SmullV);
3488              }
3489              else
3490              {
3491                  EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Multiply(op1, op2));
3492              }
3493          }
3494  
3495          public static void Smull_Ve(ArmEmitterContext context)
3496          {
3497              if (Optimizations.UseAdvSimd)
3498              {
3499                  InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64SmullVe);
3500              }
3501              else
3502              {
3503                  EmitVectorWidenBinaryOpByElemSx(context, (op1, op2) => context.Multiply(op1, op2));
3504              }
3505          }
3506  
3507          public static void Sqabs_S(ArmEmitterContext context)
3508          {
3509              if (Optimizations.UseAdvSimd)
3510              {
3511                  InstEmitSimdHelperArm64.EmitScalarSaturatingUnaryOp(context, Intrinsic.Arm64SqabsS);
3512              }
3513              else
3514              {
3515                  EmitScalarSaturatingUnaryOpSx(context, (op1) => EmitAbs(context, op1));
3516              }
3517          }
3518  
3519          public static void Sqabs_V(ArmEmitterContext context)
3520          {
3521              if (Optimizations.UseAdvSimd)
3522              {
3523                  InstEmitSimdHelperArm64.EmitVectorSaturatingUnaryOp(context, Intrinsic.Arm64SqabsV);
3524              }
3525              else
3526              {
3527                  EmitVectorSaturatingUnaryOpSx(context, (op1) => EmitAbs(context, op1));
3528              }
3529          }
3530  
3531          public static void Sqadd_S(ArmEmitterContext context)
3532          {
3533              if (Optimizations.UseAdvSimd)
3534              {
3535                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqaddS);
3536              }
3537              else
3538              {
3539                  EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Add);
3540              }
3541          }
3542  
3543          public static void Sqadd_V(ArmEmitterContext context)
3544          {
3545              if (Optimizations.UseAdvSimd)
3546              {
3547                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqaddV);
3548              }
3549              else
3550              {
3551                  EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Add);
3552              }
3553          }
3554  
3555          public static void Sqdmulh_S(ArmEmitterContext context)
3556          {
3557              if (Optimizations.UseAdvSimd)
3558              {
3559                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqdmulhS);
3560              }
3561              else
3562              {
3563                  EmitScalarSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false));
3564              }
3565          }
3566  
3567          public static void Sqdmulh_V(ArmEmitterContext context)
3568          {
3569              if (Optimizations.UseAdvSimd)
3570              {
3571                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqdmulhV);
3572              }
3573              else
3574              {
3575                  EmitVectorSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false));
3576              }
3577          }
3578  
3579          public static void Sqdmulh_Ve(ArmEmitterContext context)
3580          {
3581              if (Optimizations.UseAdvSimd)
3582              {
3583                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpByElem(context, Intrinsic.Arm64SqdmulhVe);
3584              }
3585              else
3586              {
3587                  EmitVectorSaturatingBinaryOpByElemSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: false));
3588              }
3589          }
3590  
3591          public static void Sqneg_S(ArmEmitterContext context)
3592          {
3593              if (Optimizations.UseAdvSimd)
3594              {
3595                  InstEmitSimdHelperArm64.EmitScalarSaturatingUnaryOp(context, Intrinsic.Arm64SqnegS);
3596              }
3597              else
3598              {
3599                  EmitScalarSaturatingUnaryOpSx(context, (op1) => context.Negate(op1));
3600              }
3601          }
3602  
3603          public static void Sqneg_V(ArmEmitterContext context)
3604          {
3605              if (Optimizations.UseAdvSimd)
3606              {
3607                  InstEmitSimdHelperArm64.EmitVectorSaturatingUnaryOp(context, Intrinsic.Arm64SqnegV);
3608              }
3609              else
3610              {
3611                  EmitVectorSaturatingUnaryOpSx(context, (op1) => context.Negate(op1));
3612              }
3613          }
3614  
3615          public static void Sqrdmulh_S(ArmEmitterContext context)
3616          {
3617              if (Optimizations.UseAdvSimd)
3618              {
3619                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqrdmulhS);
3620              }
3621              else
3622              {
3623                  EmitScalarSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true));
3624              }
3625          }
3626  
3627          public static void Sqrdmulh_V(ArmEmitterContext context)
3628          {
3629              if (Optimizations.UseAdvSimd)
3630              {
3631                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqrdmulhV);
3632              }
3633              else
3634              {
3635                  EmitVectorSaturatingBinaryOpSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true));
3636              }
3637          }
3638  
3639          public static void Sqrdmulh_Ve(ArmEmitterContext context)
3640          {
3641              if (Optimizations.UseAdvSimd)
3642              {
3643                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpByElem(context, Intrinsic.Arm64SqrdmulhVe);
3644              }
3645              else
3646              {
3647                  EmitVectorSaturatingBinaryOpByElemSx(context, (op1, op2) => EmitDoublingMultiplyHighHalf(context, op1, op2, round: true));
3648              }
3649          }
3650  
3651          public static void Sqsub_S(ArmEmitterContext context)
3652          {
3653              if (Optimizations.UseAdvSimd)
3654              {
3655                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64SqsubS);
3656              }
3657              else
3658              {
3659                  EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Sub);
3660              }
3661          }
3662  
3663          public static void Sqsub_V(ArmEmitterContext context)
3664          {
3665              if (Optimizations.UseAdvSimd)
3666              {
3667                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64SqsubV);
3668              }
3669              else
3670              {
3671                  EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Sub);
3672              }
3673          }
3674  
3675          public static void Sqxtn_S(ArmEmitterContext context)
3676          {
3677              if (Optimizations.UseAdvSimd)
3678              {
3679                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtnS);
3680              }
3681              else
3682              {
3683                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxSx);
3684              }
3685          }
3686  
3687          public static void Sqxtn_V(ArmEmitterContext context)
3688          {
3689              if (Optimizations.UseAdvSimd)
3690              {
3691                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtnV);
3692              }
3693              else
3694              {
3695                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxSx);
3696              }
3697          }
3698  
3699          public static void Sqxtun_S(ArmEmitterContext context)
3700          {
3701              if (Optimizations.UseAdvSimd)
3702              {
3703                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtunS);
3704              }
3705              else
3706              {
3707                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarSxZx);
3708              }
3709          }
3710  
3711          public static void Sqxtun_V(ArmEmitterContext context)
3712          {
3713              if (Optimizations.UseAdvSimd)
3714              {
3715                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SqxtunV);
3716              }
3717              else
3718              {
3719                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorSxZx);
3720              }
3721          }
3722  
        // Signed rounding halving add: per element, (n + m + 1) >> 1 (arithmetic shift).
        public static void Srhadd_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SrhaddV);
            }
            else if (Optimizations.UseSse2 && op.Size < 2)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // Per-element sign-bit bias (0x80 / 0x8000): subtracting it maps the signed
                // inputs into the unsigned domain so PAVGB/PAVGW can be used.
                Operand mask = X86GetAllElements(context, (int)(op.Size == 0 ? 0x80808080u : 0x80008000u));

                Intrinsic subInst = X86PsubInstruction[op.Size];

                Operand nMinusMask = context.AddIntrinsic(subInst, n, mask);
                Operand mMinusMask = context.AddIntrinsic(subInst, m, mask);

                // PAVG computes (a + b + 1) >> 1 on unsigned elements, which supplies
                // the rounding bit for free.
                Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw;

                Operand res = context.AddIntrinsic(avgInst, nMinusMask, mMinusMask);

                Intrinsic addInst = X86PaddInstruction[op.Size];

                // Adding the bias back undoes the earlier domain shift.
                res = context.AddIntrinsic(addInst, mask, res);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Generic fallback: widen per element, add with the rounding constant,
                // then arithmetic shift right by one.
                EmitVectorBinaryOpSx(context, (op1, op2) =>
                {
                    Operand res = context.Add(op1, op2);

                    res = context.Add(res, Const(1L));

                    return context.ShiftRightSI(res, Const(1));
                });
            }
        }
3770  
3771          public static void Ssubl_V(ArmEmitterContext context)
3772          {
3773              if (Optimizations.UseAdvSimd)
3774              {
3775                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SsublV);
3776              }
3777              else if (Optimizations.UseSse41)
3778              {
3779                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3780  
3781                  Operand n = GetVec(op.Rn);
3782                  Operand m = GetVec(op.Rm);
3783  
3784                  if (op.RegisterSize == RegisterSize.Simd128)
3785                  {
3786                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
3787                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3788                  }
3789  
3790                  Intrinsic movInst = X86PmovsxInstruction[op.Size];
3791  
3792                  n = context.AddIntrinsic(movInst, n);
3793                  m = context.AddIntrinsic(movInst, m);
3794  
3795                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
3796  
3797                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m));
3798              }
3799              else
3800              {
3801                  EmitVectorWidenRnRmBinaryOpSx(context, (op1, op2) => context.Subtract(op1, op2));
3802              }
3803          }
3804  
3805          public static void Ssubw_V(ArmEmitterContext context)
3806          {
3807              if (Optimizations.UseAdvSimd)
3808              {
3809                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SsubwV);
3810              }
3811              else if (Optimizations.UseSse41)
3812              {
3813                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3814  
3815                  Operand n = GetVec(op.Rn);
3816                  Operand m = GetVec(op.Rm);
3817  
3818                  if (op.RegisterSize == RegisterSize.Simd128)
3819                  {
3820                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3821                  }
3822  
3823                  Intrinsic movInst = X86PmovsxInstruction[op.Size];
3824  
3825                  m = context.AddIntrinsic(movInst, m);
3826  
3827                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
3828  
3829                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m));
3830              }
3831              else
3832              {
3833                  EmitVectorWidenRmBinaryOpSx(context, (op1, op2) => context.Subtract(op1, op2));
3834              }
3835          }
3836  
3837          public static void Sub_S(ArmEmitterContext context)
3838          {
3839              if (Optimizations.UseAdvSimd)
3840              {
3841                  InstEmitSimdHelperArm64.EmitScalarBinaryOp(context, Intrinsic.Arm64SubS);
3842              }
3843              else
3844              {
3845                  EmitScalarBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2));
3846              }
3847          }
3848  
3849          public static void Sub_V(ArmEmitterContext context)
3850          {
3851              if (Optimizations.UseAdvSimd)
3852              {
3853                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64SubV);
3854              }
3855              else if (Optimizations.UseSse2)
3856              {
3857                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3858  
3859                  Operand n = GetVec(op.Rn);
3860                  Operand m = GetVec(op.Rm);
3861  
3862                  Intrinsic subInst = X86PsubInstruction[op.Size];
3863  
3864                  Operand res = context.AddIntrinsic(subInst, n, m);
3865  
3866                  if (op.RegisterSize == RegisterSize.Simd64)
3867                  {
3868                      res = context.VectorZeroUpper64(res);
3869                  }
3870  
3871                  context.Copy(GetVec(op.Rd), res);
3872              }
3873              else
3874              {
3875                  EmitVectorBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2));
3876              }
3877          }
3878  
3879          public static void Subhn_V(ArmEmitterContext context)
3880          {
3881              if (Optimizations.UseAdvSimd)
3882              {
3883                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64SubhnV);
3884              }
3885              else
3886              {
3887                  EmitHighNarrow(context, (op1, op2) => context.Subtract(op1, op2), round: false);
3888              }
3889          }
3890  
3891          public static void Suqadd_S(ArmEmitterContext context)
3892          {
3893              if (Optimizations.UseAdvSimd)
3894              {
3895                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64SuqaddS);
3896              }
3897              else
3898              {
3899                  EmitScalarSaturatingBinaryOpSx(context, flags: SaturatingFlags.Accumulate);
3900              }
3901          }
3902  
3903          public static void Suqadd_V(ArmEmitterContext context)
3904          {
3905              if (Optimizations.UseAdvSimd)
3906              {
3907                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64SuqaddV);
3908              }
3909              else
3910              {
3911                  EmitVectorSaturatingBinaryOpSx(context, flags: SaturatingFlags.Accumulate);
3912              }
3913          }
3914  
3915          public static void Uaba_V(ArmEmitterContext context)
3916          {
3917              if (Optimizations.UseAdvSimd)
3918              {
3919                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UabaV);
3920              }
3921              else
3922              {
3923                  EmitVectorTernaryOpZx(context, (op1, op2, op3) =>
3924                  {
3925                      return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3)));
3926                  });
3927              }
3928          }
3929  
3930          public static void Uabal_V(ArmEmitterContext context)
3931          {
3932              if (Optimizations.UseAdvSimd)
3933              {
3934                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UabalV);
3935              }
3936              else
3937              {
3938                  EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) =>
3939                  {
3940                      return context.Add(op1, EmitAbs(context, context.Subtract(op2, op3)));
3941                  });
3942              }
3943          }
3944  
3945          public static void Uabd_V(ArmEmitterContext context)
3946          {
3947              if (Optimizations.UseAdvSimd)
3948              {
3949                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UabdV);
3950              }
3951              else if (Optimizations.UseSse41)
3952              {
3953                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3954  
3955                  Operand n = GetVec(op.Rn);
3956                  Operand m = GetVec(op.Rm);
3957  
3958                  EmitSse41VectorUabdOp(context, op, n, m, isLong: false);
3959              }
3960              else
3961              {
3962                  EmitVectorBinaryOpZx(context, (op1, op2) =>
3963                  {
3964                      return EmitAbs(context, context.Subtract(op1, op2));
3965                  });
3966              }
3967          }
3968  
3969          public static void Uabdl_V(ArmEmitterContext context)
3970          {
3971              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
3972  
3973              if (Optimizations.UseAdvSimd)
3974              {
3975                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UabdlV);
3976              }
3977              else if (Optimizations.UseSse41 && op.Size < 2)
3978              {
3979                  Operand n = GetVec(op.Rn);
3980                  Operand m = GetVec(op.Rm);
3981  
3982                  if (op.RegisterSize == RegisterSize.Simd128)
3983                  {
3984                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
3985                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
3986                  }
3987  
3988                  Intrinsic movInst = op.Size == 0
3989                      ? Intrinsic.X86Pmovzxbw
3990                      : Intrinsic.X86Pmovzxwd;
3991  
3992                  n = context.AddIntrinsic(movInst, n);
3993                  m = context.AddIntrinsic(movInst, m);
3994  
3995                  EmitSse41VectorUabdOp(context, op, n, m, isLong: true);
3996              }
3997              else
3998              {
3999                  EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) =>
4000                  {
4001                      return EmitAbs(context, context.Subtract(op1, op2));
4002                  });
4003              }
4004          }
4005  
4006          public static void Uadalp_V(ArmEmitterContext context)
4007          {
4008              if (Optimizations.UseAdvSimd)
4009              {
4010                  InstEmitSimdHelperArm64.EmitVectorBinaryOpRd(context, Intrinsic.Arm64UadalpV);
4011              }
4012              else
4013              {
4014                  EmitAddLongPairwise(context, signed: false, accumulate: true);
4015              }
4016          }
4017  
4018          public static void Uaddl_V(ArmEmitterContext context)
4019          {
4020              if (Optimizations.UseAdvSimd)
4021              {
4022                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UaddlV);
4023              }
4024              else if (Optimizations.UseSse41)
4025              {
4026                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4027  
4028                  Operand n = GetVec(op.Rn);
4029                  Operand m = GetVec(op.Rm);
4030  
4031                  if (op.RegisterSize == RegisterSize.Simd128)
4032                  {
4033                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
4034                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4035                  }
4036  
4037                  Intrinsic movInst = X86PmovzxInstruction[op.Size];
4038  
4039                  n = context.AddIntrinsic(movInst, n);
4040                  m = context.AddIntrinsic(movInst, m);
4041  
4042                  Intrinsic addInst = X86PaddInstruction[op.Size + 1];
4043  
4044                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m));
4045              }
4046              else
4047              {
4048                  EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Add(op1, op2));
4049              }
4050          }
4051  
4052          public static void Uaddlp_V(ArmEmitterContext context)
4053          {
4054              if (Optimizations.UseAdvSimd)
4055              {
4056                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UaddlpV);
4057              }
4058              else
4059              {
4060                  EmitAddLongPairwise(context, signed: false, accumulate: false);
4061              }
4062          }
4063  
4064          public static void Uaddlv_V(ArmEmitterContext context)
4065          {
4066              if (Optimizations.UseAdvSimd)
4067              {
4068                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UaddlvV);
4069              }
4070              else
4071              {
4072                  EmitVectorLongAcrossVectorOpZx(context, (op1, op2) => context.Add(op1, op2));
4073              }
4074          }
4075  
4076          public static void Uaddw_V(ArmEmitterContext context)
4077          {
4078              if (Optimizations.UseAdvSimd)
4079              {
4080                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UaddwV);
4081              }
4082              else if (Optimizations.UseSse41)
4083              {
4084                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4085  
4086                  Operand n = GetVec(op.Rn);
4087                  Operand m = GetVec(op.Rm);
4088  
4089                  if (op.RegisterSize == RegisterSize.Simd128)
4090                  {
4091                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4092                  }
4093  
4094                  Intrinsic movInst = X86PmovzxInstruction[op.Size];
4095  
4096                  m = context.AddIntrinsic(movInst, m);
4097  
4098                  Intrinsic addInst = X86PaddInstruction[op.Size + 1];
4099  
4100                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(addInst, n, m));
4101              }
4102              else
4103              {
4104                  EmitVectorWidenRmBinaryOpZx(context, (op1, op2) => context.Add(op1, op2));
4105              }
4106          }
4107  
        // UHADD: unsigned halving add, element-wise (a + b) >> 1 without losing the
        // carry out of the element width.
        public static void Uhadd_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UhaddV);
            }
            else if (Optimizations.UseSse2 && op.Size > 0)
            {
                // SSE2 path, 16/32-bit elements only: uses the carry-safe identity
                // (n + m) >> 1 == (n & m) + ((n ^ m) >> 1).
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // Bits common to both operands contribute fully; differing bits
                // contribute half (hence the shift of the XOR).
                Operand res = context.AddIntrinsic(Intrinsic.X86Pand, n, m);
                Operand res2 = context.AddIntrinsic(Intrinsic.X86Pxor, n, m);

                // Element-width logical shift; there is no byte-granular PSRL,
                // which is why op.Size > 0 gates this path.
                Intrinsic shiftInst = op.Size == 1 ? Intrinsic.X86Psrlw : Intrinsic.X86Psrld;

                res2 = context.AddIntrinsic(shiftInst, res2, Const(1));

                Intrinsic addInst = X86PaddInstruction[op.Size];

                res = context.AddIntrinsic(addInst, res, res2);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    // 64-bit vector form zeroes the upper half of the destination.
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Fallback: elements are zero-extended to 64-bit, so the add cannot
                // overflow before the shift.
                EmitVectorBinaryOpZx(context, (op1, op2) =>
                {
                    return context.ShiftRightUI(context.Add(op1, op2), Const(1));
                });
            }
        }
4147  
        // UHSUB: unsigned halving subtract, element-wise (a - b) >> 1 on the
        // non-truncated difference.
        public static void Uhsub_V(ArmEmitterContext context)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            if (Optimizations.UseAdvSimd)
            {
                InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UhsubV);
            }
            else if (Optimizations.UseSse2 && op.Size < 2)
            {
                Operand n = GetVec(op.Rn);
                Operand m = GetVec(op.Rm);

                // PAVG computes (n + m + 1) >> 1; for unsigned values,
                // n - avg(n, m) == (n - m) >> 1 (checks out for both parities of
                // n - m), which is exactly the halving subtract. PAVG only exists
                // for bytes/words, hence op.Size < 2.
                Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw;

                Operand res = context.AddIntrinsic(avgInst, n, m);

                Intrinsic subInst = X86PsubInstruction[op.Size];

                // Operand order matters: n minus the average.
                res = context.AddIntrinsic(subInst, n, res);

                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(op.Rd), res);
            }
            else
            {
                // Fallback: the subtract happens on 64-bit zero-extended elements,
                // so the logical shift sees the borrow bits and the later narrow
                // keeps the correct low element bits.
                EmitVectorBinaryOpZx(context, (op1, op2) =>
                {
                    return context.ShiftRightUI(context.Subtract(op1, op2), Const(1));
                });
            }
        }
4184  
4185          public static void Umax_V(ArmEmitterContext context)
4186          {
4187              if (Optimizations.UseAdvSimd)
4188              {
4189                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmaxV);
4190              }
4191              else if (Optimizations.UseSse41)
4192              {
4193                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4194  
4195                  Operand n = GetVec(op.Rn);
4196                  Operand m = GetVec(op.Rm);
4197  
4198                  Intrinsic maxInst = X86PmaxuInstruction[op.Size];
4199  
4200                  Operand res = context.AddIntrinsic(maxInst, n, m);
4201  
4202                  if (op.RegisterSize == RegisterSize.Simd64)
4203                  {
4204                      res = context.VectorZeroUpper64(res);
4205                  }
4206  
4207                  context.Copy(GetVec(op.Rd), res);
4208              }
4209              else
4210              {
4211                  EmitVectorBinaryOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false));
4212              }
4213          }
4214  
4215          public static void Umaxp_V(ArmEmitterContext context)
4216          {
4217              if (Optimizations.UseAdvSimd)
4218              {
4219                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmaxpV);
4220              }
4221              else if (Optimizations.UseSsse3)
4222              {
4223                  EmitSsse3VectorPairwiseOp(context, X86PmaxuInstruction);
4224              }
4225              else
4226              {
4227                  EmitVectorPairwiseOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false));
4228              }
4229          }
4230  
4231          public static void Umaxv_V(ArmEmitterContext context)
4232          {
4233              if (Optimizations.UseAdvSimd)
4234              {
4235                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UmaxvV);
4236              }
4237              else
4238              {
4239                  EmitVectorAcrossVectorOpZx(context, (op1, op2) => EmitMax64Op(context, op1, op2, signed: false));
4240              }
4241          }
4242  
4243          public static void Umin_V(ArmEmitterContext context)
4244          {
4245              if (Optimizations.UseAdvSimd)
4246              {
4247                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UminV);
4248              }
4249              else if (Optimizations.UseSse41)
4250              {
4251                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4252  
4253                  Operand n = GetVec(op.Rn);
4254                  Operand m = GetVec(op.Rm);
4255  
4256                  Intrinsic minInst = X86PminuInstruction[op.Size];
4257  
4258                  Operand res = context.AddIntrinsic(minInst, n, m);
4259  
4260                  if (op.RegisterSize == RegisterSize.Simd64)
4261                  {
4262                      res = context.VectorZeroUpper64(res);
4263                  }
4264  
4265                  context.Copy(GetVec(op.Rd), res);
4266              }
4267              else
4268              {
4269                  EmitVectorBinaryOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false));
4270              }
4271          }
4272  
4273          public static void Uminp_V(ArmEmitterContext context)
4274          {
4275              if (Optimizations.UseAdvSimd)
4276              {
4277                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UminpV);
4278              }
4279              else if (Optimizations.UseSsse3)
4280              {
4281                  EmitSsse3VectorPairwiseOp(context, X86PminuInstruction);
4282              }
4283              else
4284              {
4285                  EmitVectorPairwiseOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false));
4286              }
4287          }
4288  
4289          public static void Uminv_V(ArmEmitterContext context)
4290          {
4291              if (Optimizations.UseAdvSimd)
4292              {
4293                  InstEmitSimdHelperArm64.EmitVectorUnaryOp(context, Intrinsic.Arm64UminvV);
4294              }
4295              else
4296              {
4297                  EmitVectorAcrossVectorOpZx(context, (op1, op2) => EmitMin64Op(context, op1, op2, signed: false));
4298              }
4299          }
4300  
4301          public static void Umlal_V(ArmEmitterContext context)
4302          {
4303              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4304  
4305              if (Optimizations.UseAdvSimd)
4306              {
4307                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UmlalV);
4308              }
4309              else if (Optimizations.UseSse41 && op.Size < 2)
4310              {
4311                  Operand d = GetVec(op.Rd);
4312                  Operand n = GetVec(op.Rn);
4313                  Operand m = GetVec(op.Rm);
4314  
4315                  if (op.RegisterSize == RegisterSize.Simd128)
4316                  {
4317                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
4318                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4319                  }
4320  
4321                  Intrinsic movInst = X86PmovzxInstruction[op.Size];
4322  
4323                  n = context.AddIntrinsic(movInst, n);
4324                  m = context.AddIntrinsic(movInst, m);
4325  
4326                  Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld;
4327  
4328                  Operand res = context.AddIntrinsic(mullInst, n, m);
4329  
4330                  Intrinsic addInst = X86PaddInstruction[op.Size + 1];
4331  
4332                  context.Copy(d, context.AddIntrinsic(addInst, d, res));
4333              }
4334              else
4335              {
4336                  EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) =>
4337                  {
4338                      return context.Add(op1, context.Multiply(op2, op3));
4339                  });
4340              }
4341          }
4342  
4343          public static void Umlal_Ve(ArmEmitterContext context)
4344          {
4345              if (Optimizations.UseAdvSimd)
4346              {
4347                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64UmlalVe);
4348              }
4349              else
4350              {
4351                  EmitVectorWidenTernaryOpByElemZx(context, (op1, op2, op3) =>
4352                  {
4353                      return context.Add(op1, context.Multiply(op2, op3));
4354                  });
4355              }
4356          }
4357  
4358          public static void Umlsl_V(ArmEmitterContext context)
4359          {
4360              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4361  
4362              if (Optimizations.UseAdvSimd)
4363              {
4364                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRd(context, Intrinsic.Arm64UmlslV);
4365              }
4366              else if (Optimizations.UseSse41 && op.Size < 2)
4367              {
4368                  Operand d = GetVec(op.Rd);
4369                  Operand n = GetVec(op.Rn);
4370                  Operand m = GetVec(op.Rm);
4371  
4372                  if (op.RegisterSize == RegisterSize.Simd128)
4373                  {
4374                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
4375                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4376                  }
4377  
4378                  Intrinsic movInst = op.Size == 0 ? Intrinsic.X86Pmovzxbw : Intrinsic.X86Pmovzxwd;
4379  
4380                  n = context.AddIntrinsic(movInst, n);
4381                  m = context.AddIntrinsic(movInst, m);
4382  
4383                  Intrinsic mullInst = op.Size == 0 ? Intrinsic.X86Pmullw : Intrinsic.X86Pmulld;
4384  
4385                  Operand res = context.AddIntrinsic(mullInst, n, m);
4386  
4387                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
4388  
4389                  context.Copy(d, context.AddIntrinsic(subInst, d, res));
4390              }
4391              else
4392              {
4393                  EmitVectorWidenRnRmTernaryOpZx(context, (op1, op2, op3) =>
4394                  {
4395                      return context.Subtract(op1, context.Multiply(op2, op3));
4396                  });
4397              }
4398          }
4399  
4400          public static void Umlsl_Ve(ArmEmitterContext context)
4401          {
4402              if (Optimizations.UseAdvSimd)
4403              {
4404                  InstEmitSimdHelperArm64.EmitVectorTernaryOpRdByElem(context, Intrinsic.Arm64UmlslVe);
4405              }
4406              else
4407              {
4408                  EmitVectorWidenTernaryOpByElemZx(context, (op1, op2, op3) =>
4409                  {
4410                      return context.Subtract(op1, context.Multiply(op2, op3));
4411                  });
4412              }
4413          }
4414  
4415          public static void Umull_V(ArmEmitterContext context)
4416          {
4417              if (Optimizations.UseAdvSimd)
4418              {
4419                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UmullV);
4420              }
4421              else
4422              {
4423                  EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Multiply(op1, op2));
4424              }
4425          }
4426  
4427          public static void Umull_Ve(ArmEmitterContext context)
4428          {
4429              if (Optimizations.UseAdvSimd)
4430              {
4431                  InstEmitSimdHelperArm64.EmitVectorBinaryOpByElem(context, Intrinsic.Arm64UmullVe);
4432              }
4433              else
4434              {
4435                  EmitVectorWidenBinaryOpByElemZx(context, (op1, op2) => context.Multiply(op1, op2));
4436              }
4437          }
4438  
4439          public static void Uqadd_S(ArmEmitterContext context)
4440          {
4441              if (Optimizations.UseAdvSimd)
4442              {
4443                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64UqaddS);
4444              }
4445              else
4446              {
4447                  EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Add);
4448              }
4449          }
4450  
4451          public static void Uqadd_V(ArmEmitterContext context)
4452          {
4453              if (Optimizations.UseAdvSimd)
4454              {
4455                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqaddV);
4456              }
4457              else
4458              {
4459                  EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Add);
4460              }
4461          }
4462  
4463          public static void Uqsub_S(ArmEmitterContext context)
4464          {
4465              if (Optimizations.UseAdvSimd)
4466              {
4467                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOp(context, Intrinsic.Arm64UqsubS);
4468              }
4469              else
4470              {
4471                  EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Sub);
4472              }
4473          }
4474  
4475          public static void Uqsub_V(ArmEmitterContext context)
4476          {
4477              if (Optimizations.UseAdvSimd)
4478              {
4479                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOp(context, Intrinsic.Arm64UqsubV);
4480              }
4481              else
4482              {
4483                  EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Sub);
4484              }
4485          }
4486  
4487          public static void Uqxtn_S(ArmEmitterContext context)
4488          {
4489              if (Optimizations.UseAdvSimd)
4490              {
4491                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64UqxtnS);
4492              }
4493              else
4494              {
4495                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.ScalarZxZx);
4496              }
4497          }
4498  
4499          public static void Uqxtn_V(ArmEmitterContext context)
4500          {
4501              if (Optimizations.UseAdvSimd)
4502              {
4503                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64UqxtnV);
4504              }
4505              else
4506              {
4507                  EmitSaturatingNarrowOp(context, SaturatingNarrowFlags.VectorZxZx);
4508              }
4509          }
4510  
4511          public static void Urhadd_V(ArmEmitterContext context)
4512          {
4513              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4514  
4515              if (Optimizations.UseAdvSimd)
4516              {
4517                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UrhaddV);
4518              }
4519              else if (Optimizations.UseSse2 && op.Size < 2)
4520              {
4521                  Operand n = GetVec(op.Rn);
4522                  Operand m = GetVec(op.Rm);
4523  
4524                  Intrinsic avgInst = op.Size == 0 ? Intrinsic.X86Pavgb : Intrinsic.X86Pavgw;
4525  
4526                  Operand res = context.AddIntrinsic(avgInst, n, m);
4527  
4528                  if (op.RegisterSize == RegisterSize.Simd64)
4529                  {
4530                      res = context.VectorZeroUpper64(res);
4531                  }
4532  
4533                  context.Copy(GetVec(op.Rd), res);
4534              }
4535              else
4536              {
4537                  EmitVectorBinaryOpZx(context, (op1, op2) =>
4538                  {
4539                      Operand res = context.Add(op1, op2);
4540  
4541                      res = context.Add(res, Const(1L));
4542  
4543                      return context.ShiftRightUI(res, Const(1));
4544                  });
4545              }
4546          }
4547  
4548          public static void Usqadd_S(ArmEmitterContext context)
4549          {
4550              if (Optimizations.UseAdvSimd)
4551              {
4552                  InstEmitSimdHelperArm64.EmitScalarSaturatingBinaryOpRd(context, Intrinsic.Arm64UsqaddS);
4553              }
4554              else
4555              {
4556                  EmitScalarSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate);
4557              }
4558          }
4559  
4560          public static void Usqadd_V(ArmEmitterContext context)
4561          {
4562              if (Optimizations.UseAdvSimd)
4563              {
4564                  InstEmitSimdHelperArm64.EmitVectorSaturatingBinaryOpRd(context, Intrinsic.Arm64UsqaddV);
4565              }
4566              else
4567              {
4568                  EmitVectorSaturatingBinaryOpZx(context, SaturatingFlags.Accumulate);
4569              }
4570          }
4571  
4572          public static void Usubl_V(ArmEmitterContext context)
4573          {
4574              if (Optimizations.UseAdvSimd)
4575              {
4576                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UsublV);
4577              }
4578              else if (Optimizations.UseSse41)
4579              {
4580                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4581  
4582                  Operand n = GetVec(op.Rn);
4583                  Operand m = GetVec(op.Rm);
4584  
4585                  if (op.RegisterSize == RegisterSize.Simd128)
4586                  {
4587                      n = context.AddIntrinsic(Intrinsic.X86Psrldq, n, Const(8));
4588                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4589                  }
4590  
4591                  Intrinsic movInst = X86PmovzxInstruction[op.Size];
4592  
4593                  n = context.AddIntrinsic(movInst, n);
4594                  m = context.AddIntrinsic(movInst, m);
4595  
4596                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
4597  
4598                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m));
4599              }
4600              else
4601              {
4602                  EmitVectorWidenRnRmBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2));
4603              }
4604          }
4605  
4606          public static void Usubw_V(ArmEmitterContext context)
4607          {
4608              if (Optimizations.UseAdvSimd)
4609              {
4610                  InstEmitSimdHelperArm64.EmitVectorBinaryOp(context, Intrinsic.Arm64UsubwV);
4611              }
4612              else if (Optimizations.UseSse41)
4613              {
4614                  OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
4615  
4616                  Operand n = GetVec(op.Rn);
4617                  Operand m = GetVec(op.Rm);
4618  
4619                  if (op.RegisterSize == RegisterSize.Simd128)
4620                  {
4621                      m = context.AddIntrinsic(Intrinsic.X86Psrldq, m, Const(8));
4622                  }
4623  
4624                  Intrinsic movInst = X86PmovzxInstruction[op.Size];
4625  
4626                  m = context.AddIntrinsic(movInst, m);
4627  
4628                  Intrinsic subInst = X86PsubInstruction[op.Size + 1];
4629  
4630                  context.Copy(GetVec(op.Rd), context.AddIntrinsic(subInst, n, m));
4631              }
4632              else
4633              {
4634                  EmitVectorWidenRmBinaryOpZx(context, (op1, op2) => context.Subtract(op1, op2));
4635              }
4636          }
4637  
4638          private static Operand EmitAbs(ArmEmitterContext context, Operand value)
4639          {
4640              Operand isPositive = context.ICompareGreaterOrEqual(value, Const(value.Type, 0));
4641  
4642              return context.ConditionalSelect(isPositive, value, context.Negate(value));
4643          }
4644  
        // Emits the ADDLP-family pattern: adds adjacent element pairs of Rn at size
        // op.Size into elements of size op.Size + 1; when 'accumulate' is set, the
        // pair sums are added onto the existing Rd elements (xADALP behavior).
        private static void EmitAddLongPairwise(ArmEmitterContext context, bool signed, bool accumulate)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            // Number of destination (widened) elements produced.
            int pairs = op.GetPairsCount() >> op.Size;

            for (int index = 0; index < pairs; index++)
            {
                int pairIndex = index << 1;

                // Adjacent source elements, sign- or zero-extended per 'signed'.
                Operand ne0 = EmitVectorExtract(context, op.Rn, pairIndex, op.Size, signed);
                Operand ne1 = EmitVectorExtract(context, op.Rn, pairIndex + 1, op.Size, signed);

                Operand e = context.Add(ne0, ne1);

                if (accumulate)
                {
                    // The accumulator element is read at the widened size.
                    Operand de = EmitVectorExtract(context, op.Rd, index, op.Size + 1, signed);

                    e = context.Add(e, de);
                }

                res = EmitVectorInsert(context, res, e, index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }
4674  
        // Emits the doubling multiply-high core (SQDMULH/SQRDMULH style):
        // (2 * n * m) >> eSize, optionally with rounding. Inputs are element values
        // already extended to 64-bit; saturation on insert is the caller's job.
        private static Operand EmitDoublingMultiplyHighHalf(
            ArmEmitterContext context,
            Operand n,
            Operand m,
            bool round)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            int eSize = 8 << op.Size;

            Operand res = context.Multiply(n, m);

            if (!round)
            {
                // The doubling (<< 1) is folded into shifting right by one less.
                res = context.ShiftRightSI(res, Const(eSize - 1));
            }
            else
            {
                // Rounding form: double, add half an element's worth, then shift.
                long roundConst = 1L << (eSize - 1);

                res = context.ShiftLeft(res, Const(1));

                res = context.Add(res, Const(roundConst));

                res = context.ShiftRightSI(res, Const(eSize));

                // NOTE(review): for 32-bit elements, INT_MIN * INT_MIN with rounding
                // produces exactly (long)int.MinValue here; negating it yields +2^31,
                // presumably so the caller's saturating insert clamps it to INT_MAX —
                // confirm against the caller's saturation logic.
                Operand isIntMin = context.ICompareEqual(res, Const((long)int.MinValue));

                res = context.ConditionalSelect(isIntMin, context.Negate(res), res);
            }

            return res;
        }
4708  
        // Emits the xHN/RxHN (high-half narrow) pattern: applies 'emit' to the
        // double-width elements of Rn and Rm, keeps the upper eSize bits of each
        // result (with optional rounding), and writes the narrowed elements into
        // the low half (Simd64) or the high half (Simd128 "2" variant) of Rd.
        private static void EmitHighNarrow(ArmEmitterContext context, Func2I emit, bool round)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            int elems = 8 >> op.Size;
            int eSize = 8 << op.Size;

            // Non-zero for the "2" variant, which targets the upper half of Rd.
            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            Operand d = GetVec(op.Rd);

            // When writing the high half, start from current Rd to preserve the low half.
            Operand res = part == 0 ? context.VectorZero() : context.Copy(d);

            long roundConst = 1L << (eSize - 1);

            for (int index = 0; index < elems; index++)
            {
                // Sources are read at the double width (op.Size + 1), zero-extended.
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size + 1);
                Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size + 1);

                Operand de = emit(ne, me);

                if (round)
                {
                    de = context.Add(de, Const(roundConst));
                }

                // Keep the high eSize bits of the double-width result.
                de = context.ShiftRightUI(de, Const(eSize));

                res = EmitVectorInsert(context, res, de, part + index, op.Size);
            }

            context.Copy(d, res);
        }
4743  
4744          private static Operand EmitMax64Op(ArmEmitterContext context, Operand op1, Operand op2, bool signed)
4745          {
4746              Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);
4747  
4748              Operand cmp = signed
4749                  ? context.ICompareGreaterOrEqual(op1, op2)
4750                  : context.ICompareGreaterOrEqualUI(op1, op2);
4751  
4752              return context.ConditionalSelect(cmp, op1, op2);
4753          }
4754  
4755          private static Operand EmitMin64Op(ArmEmitterContext context, Operand op1, Operand op2, bool signed)
4756          {
4757              Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);
4758  
4759              Operand cmp = signed
4760                  ? context.ICompareLessOrEqual(op1, op2)
4761                  : context.ICompareLessOrEqualUI(op1, op2);
4762  
4763              return context.ConditionalSelect(cmp, op1, op2);
4764          }
4765  
4766          private static void EmitSse41ScalarRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
4767          {
4768              OpCodeSimd op = (OpCodeSimd)context.CurrOp;
4769  
4770              Operand n = GetVec(op.Rn);
4771  
4772              Operand res;
4773  
4774              if (roundMode != FPRoundingMode.ToNearestAway)
4775              {
4776                  Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundsd : Intrinsic.X86Roundss;
4777  
4778                  res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
4779              }
4780              else
4781              {
4782                  res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: true);
4783              }
4784  
4785              if ((op.Size & 1) != 0)
4786              {
4787                  res = context.VectorZeroUpper64(res);
4788              }
4789              else
4790              {
4791                  res = context.VectorZeroUpper96(res);
4792              }
4793  
4794              context.Copy(GetVec(op.Rd), res);
4795          }
4796  
4797          private static void EmitSse41VectorRoundOpF(ArmEmitterContext context, FPRoundingMode roundMode)
4798          {
4799              OpCodeSimd op = (OpCodeSimd)context.CurrOp;
4800  
4801              Operand n = GetVec(op.Rn);
4802  
4803              Operand res;
4804  
4805              if (roundMode != FPRoundingMode.ToNearestAway)
4806              {
4807                  Intrinsic inst = (op.Size & 1) != 0 ? Intrinsic.X86Roundpd : Intrinsic.X86Roundps;
4808  
4809                  res = context.AddIntrinsic(inst, n, Const(X86GetRoundControl(roundMode)));
4810              }
4811              else
4812              {
4813                  res = EmitSse41RoundToNearestWithTiesToAwayOpF(context, n, scalar: false);
4814              }
4815  
4816              if (op.RegisterSize == RegisterSize.Simd64)
4817              {
4818                  res = context.VectorZeroUpper64(res);
4819              }
4820  
4821              context.Copy(GetVec(op.Rd), res);
4822          }
4823  
4824          private static Operand EmitSse41Round32Exp8OpF(ArmEmitterContext context, Operand value, bool scalar)
4825          {
4826              Operand roundMask;
4827              Operand truncMask;
4828              Operand expMask;
4829  
4830              if (scalar)
4831              {
4832                  roundMask = X86GetScalar(context, 0x4000);
4833                  truncMask = X86GetScalar(context, unchecked((int)0xFFFF8000));
4834                  expMask = X86GetScalar(context, 0x7F800000);
4835              }
4836              else
4837              {
4838                  roundMask = X86GetAllElements(context, 0x4000);
4839                  truncMask = X86GetAllElements(context, unchecked((int)0xFFFF8000));
4840                  expMask = X86GetAllElements(context, 0x7F800000);
4841              }
4842  
4843              Operand oValue = value;
4844              Operand masked = context.AddIntrinsic(Intrinsic.X86Pand, value, expMask);
4845              Operand isNaNInf = context.AddIntrinsic(Intrinsic.X86Pcmpeqd, masked, expMask);
4846  
4847              value = context.AddIntrinsic(Intrinsic.X86Paddd, value, roundMask);
4848              value = context.AddIntrinsic(Intrinsic.X86Pand, value, truncMask);
4849  
4850              return context.AddIntrinsic(Intrinsic.X86Blendvps, value, oValue, isNaNInf);
4851          }
4852  
        // Blends the computed step result (`res`) with a fixed fallback (`mask`) in
        // the lanes where one operand is +/-0 and the other is +/-infinity —
        // presumably the FRECPS/FRSQRTS special case where 0 * Inf inside the step
        // would produce NaN; confirm against the Arm pseudocode.
        private static Operand EmitSse41RecipStepSelectOpF(
            ArmEmitterContext context,
            Operand n,
            Operand m,
            Operand res,
            Operand mask,
            bool scalar,
            int sizeF)
        {
            Intrinsic cmpOp;
            Intrinsic shlOp;
            Intrinsic blendOp;
            Operand zero = context.VectorZero();
            Operand expMask;

            if (sizeF == 0)
            {
                // Single precision lanes.
                cmpOp = Intrinsic.X86Pcmpeqd;
                shlOp = Intrinsic.X86Pslld;
                blendOp = Intrinsic.X86Blendvps;
                expMask = scalar ? X86GetScalar(context, 0x7F800000 << 1) : X86GetAllElements(context, 0x7F800000 << 1);
            }
            else /* if (sizeF == 1) */
            {
                // Double precision lanes.
                cmpOp = Intrinsic.X86Pcmpeqq;
                shlOp = Intrinsic.X86Psllq;
                blendOp = Intrinsic.X86Blendvpd;
                expMask = scalar ? X86GetScalar(context, 0x7FF0000000000000L << 1) : X86GetAllElements(context, 0x7FF0000000000000L << 1);
            }

            // Shift out the sign bit so +/-0 compares equal to zero and +/-Inf
            // compares equal to the (also shifted) exponent mask.
            n = context.AddIntrinsic(shlOp, n, Const(1));
            m = context.AddIntrinsic(shlOp, m, Const(1));

            Operand nZero = context.AddIntrinsic(cmpOp, n, zero);
            Operand mZero = context.AddIntrinsic(cmpOp, m, zero);
            Operand nInf = context.AddIntrinsic(cmpOp, n, expMask);
            Operand mInf = context.AddIntrinsic(cmpOp, m, expMask);

            // Special-case lanes: (n or m is zero) AND (n or m is infinity).
            Operand nmZero = context.AddIntrinsic(Intrinsic.X86Por, nZero, mZero);
            Operand nmInf = context.AddIntrinsic(Intrinsic.X86Por, nInf, mInf);
            Operand nmZeroInf = context.AddIntrinsic(Intrinsic.X86Pand, nmZero, nmInf);

            return context.AddIntrinsic(blendOp, res, mask, nmZeroInf);
        }
4897  
        // Builds per-lane NaN masks for opF: qNaNMask is all-ones in lanes holding a
        // quiet NaN, sNaNMask in lanes holding a signaling NaN. Pass isQNaN to request
        // only one of the two masks; the other out parameter is left as default.
        public static void EmitSse2VectorIsNaNOpF(
            ArmEmitterContext context,
            Operand opF,
            out Operand qNaNMask,
            out Operand sNaNMask,
            bool? isQNaN = null)
        {
            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;

            if ((op.Size & 1) == 0)
            {
                // Single precision: mantissa bit 22 is the quiet bit.
                const int QBit = 22;

                Operand qMask = X86GetAllElements(context, 1 << QBit);

                // mask1: lanes where the value is any NaN (unordered with itself).
                Operand mask1 = context.AddIntrinsic(Intrinsic.X86Cmpps, opF, opF, Const((int)CmpCondition.UnorderedQ));

                // mask2: lanes where the quiet bit is set in the raw bit pattern.
                Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask);
                mask2 = context.AddIntrinsic(Intrinsic.X86Cmpps, mask2, qMask, Const((int)CmpCondition.Equal));

                // Quiet NaN = NaN with quiet bit set; signaling NaN = NaN with it clear.
                qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andps, mask2, mask1) : default;
                sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnps, mask2, mask1) : default;
            }
            else /* if ((op.Size & 1) == 1) */
            {
                // Double precision: mantissa bit 51 is the quiet bit.
                const int QBit = 51;

                Operand qMask = X86GetAllElements(context, 1L << QBit);

                // mask1: lanes where the value is any NaN (unordered with itself).
                Operand mask1 = context.AddIntrinsic(Intrinsic.X86Cmppd, opF, opF, Const((int)CmpCondition.UnorderedQ));

                // mask2: lanes where the quiet bit is set in the raw bit pattern.
                Operand mask2 = context.AddIntrinsic(Intrinsic.X86Pand, opF, qMask);
                mask2 = context.AddIntrinsic(Intrinsic.X86Cmppd, mask2, qMask, Const((int)CmpCondition.Equal));

                // Quiet NaN = NaN with quiet bit set; signaling NaN = NaN with it clear.
                qNaNMask = isQNaN == null || (bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andpd, mask2, mask1) : default;
                sNaNMask = isQNaN == null || !(bool)isQNaN ? context.AddIntrinsic(Intrinsic.X86Andnpd, mask2, mask1) : default;
            }
        }
4936  
        // Wraps `emit` with NaN propagation: where either input lane is NaN, the
        // result is the selected input NaN made quiet (signaling NaNs take priority
        // over quiet NaNs, and n over m); elsewhere the result of `emit` is used.
        // When n/m are passed explicitly the blended vector is returned; otherwise
        // Rn/Rm are read, the result is written to Rd and default is returned.
        public static Operand EmitSse41ProcessNaNsOpF(
            ArmEmitterContext context,
            Func2I emit,
            bool scalar,
            Operand n = default,
            Operand m = default)
        {
            Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n;
            Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m;

            EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out Operand nSNaNMask);
            EmitSse2VectorIsNaNOpF(context, mCopy, out _, out Operand mSNaNMask, isQNaN: false);

            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;

            if (sizeF == 0)
            {
                // Single precision: mantissa bit 22 is the quiet bit.
                const int QBit = 22;

                Operand qMask = scalar ? X86GetScalar(context, 1 << QBit) : X86GetAllElements(context, 1 << QBit);

                // Take n's NaN when n is SNaN, or when n is QNaN and m is not SNaN.
                Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask);
                resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask);

                // Select the propagated NaN and force its quiet bit on.
                Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, nCopy, resNaNMask);
                resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask);

                // Use the emit result only where both inputs are ordered (non-NaN).
                Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmpps, nCopy, mCopy, Const((int)CmpCondition.OrderedQ));

                Operand res = context.AddIntrinsic(Intrinsic.X86Blendvps, resNaN, emit(nCopy, mCopy), resMask);

                if (n != default || m != default)
                {
                    return res;
                }

                if (scalar)
                {
                    res = context.VectorZeroUpper96(res);
                }
                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);

                return default;
            }
            else /* if (sizeF == 1) */
            {
                // Double precision: mantissa bit 51 is the quiet bit.
                const int QBit = 51;

                Operand qMask = scalar ? X86GetScalar(context, 1L << QBit) : X86GetAllElements(context, 1L << QBit);

                // Take n's NaN when n is SNaN, or when n is QNaN and m is not SNaN.
                Operand resNaNMask = context.AddIntrinsic(Intrinsic.X86Pandn, mSNaNMask, nQNaNMask);
                resNaNMask = context.AddIntrinsic(Intrinsic.X86Por, resNaNMask, nSNaNMask);

                // Select the propagated NaN and force its quiet bit on.
                Operand resNaN = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, nCopy, resNaNMask);
                resNaN = context.AddIntrinsic(Intrinsic.X86Por, resNaN, qMask);

                // Use the emit result only where both inputs are ordered (non-NaN).
                Operand resMask = context.AddIntrinsic(Intrinsic.X86Cmppd, nCopy, mCopy, Const((int)CmpCondition.OrderedQ));

                Operand res = context.AddIntrinsic(Intrinsic.X86Blendvpd, resNaN, emit(nCopy, mCopy), resMask);

                if (n != default || m != default)
                {
                    return res;
                }

                if (scalar)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);

                return default;
            }
        }
5017  
5018          private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax)
5019          {
5020              IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
5021  
5022              if ((op.Size & 1) == 0)
5023              {
5024                  Operand mask = X86GetAllElements(context, -0f);
5025  
5026                  Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxps : Intrinsic.X86Minps, n, m);
5027                  res = context.AddIntrinsic(Intrinsic.X86Andnps, mask, res);
5028  
5029                  Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m);
5030                  resSign = context.AddIntrinsic(Intrinsic.X86Andps, mask, resSign);
5031  
5032                  return context.AddIntrinsic(Intrinsic.X86Por, res, resSign);
5033              }
5034              else /* if ((op.Size & 1) == 1) */
5035              {
5036                  Operand mask = X86GetAllElements(context, -0d);
5037  
5038                  Operand res = context.AddIntrinsic(isMax ? Intrinsic.X86Maxpd : Intrinsic.X86Minpd, n, m);
5039                  res = context.AddIntrinsic(Intrinsic.X86Andnpd, mask, res);
5040  
5041                  Operand resSign = context.AddIntrinsic(isMax ? Intrinsic.X86Pand : Intrinsic.X86Por, n, m);
5042                  resSign = context.AddIntrinsic(Intrinsic.X86Andpd, mask, resSign);
5043  
5044                  return context.AddIntrinsic(Intrinsic.X86Por, res, resSign);
5045              }
5046          }
5047  
        // Emits the "number-preferring" max/min (FMAXNM/FMINNM style): a quiet NaN
        // input whose partner is not a quiet NaN is first replaced with -Inf (max)
        // or +Inf (min) so the regular max/min then picks the numeric operand; the
        // remaining NaN cases are handled by EmitSse41ProcessNaNsOpF. When n/m are
        // passed explicitly the result vector is returned; otherwise Rn/Rm are read,
        // the result is written to Rd and default is returned.
        private static Operand EmitSse41MaxMinNumOpF(
            ArmEmitterContext context,
            bool isMaxNum,
            bool scalar,
            Operand n = default,
            Operand m = default)
        {
            Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n;
            Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m;

            EmitSse2VectorIsNaNOpF(context, nCopy, out Operand nQNaNMask, out _, isQNaN: true);
            EmitSse2VectorIsNaNOpF(context, mCopy, out Operand mQNaNMask, out _, isQNaN: true);

            int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1;

            if (sizeF == 0)
            {
                // Replacement value that loses against any number in the max/min.
                Operand negInfMask = scalar
                    ? X86GetScalar(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity)
                    : X86GetAllElements(context, isMaxNum ? float.NegativeInfinity : float.PositiveInfinity);

                // nMask: n is QNaN and m is not; mMask: m is QNaN and n is not.
                Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnps, mQNaNMask, nQNaNMask);
                Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnps, nQNaNMask, mQNaNMask);

                nCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, nCopy, negInfMask, nMask);
                mCopy = context.AddIntrinsic(Intrinsic.X86Blendvps, mCopy, negInfMask, mMask);

                Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                {
                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
                }, scalar: scalar, nCopy, mCopy);

                if (n != default || m != default)
                {
                    return res;
                }

                if (scalar)
                {
                    res = context.VectorZeroUpper96(res);
                }
                else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);

                return default;
            }
            else /* if (sizeF == 1) */
            {
                // Replacement value that loses against any number in the max/min.
                Operand negInfMask = scalar
                    ? X86GetScalar(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity)
                    : X86GetAllElements(context, isMaxNum ? double.NegativeInfinity : double.PositiveInfinity);

                // nMask: n is QNaN and m is not; mMask: m is QNaN and n is not.
                Operand nMask = context.AddIntrinsic(Intrinsic.X86Andnpd, mQNaNMask, nQNaNMask);
                Operand mMask = context.AddIntrinsic(Intrinsic.X86Andnpd, nQNaNMask, mQNaNMask);

                nCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, nCopy, negInfMask, nMask);
                mCopy = context.AddIntrinsic(Intrinsic.X86Blendvpd, mCopy, negInfMask, mMask);

                Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) =>
                {
                    return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum);
                }, scalar: scalar, nCopy, mCopy);

                if (n != default || m != default)
                {
                    return res;
                }

                if (scalar)
                {
                    res = context.VectorZeroUpper64(res);
                }

                context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res);

                return default;
            }
        }
5130  
        // Accumulation mode for EmitSse41VectorMul_AddSub: plain multiply, or
        // multiply with the product added to / subtracted from the destination.
        private enum AddSub
        {
            None,
            Add,
            Subtract,
        }
5137  
5138          private static void EmitSse41VectorMul_AddSub(ArmEmitterContext context, AddSub addSub)
5139          {
5140              OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
5141  
5142              Operand n = GetVec(op.Rn);
5143              Operand m = GetVec(op.Rm);
5144  
5145              Operand res;
5146  
5147              if (op.Size == 0)
5148              {
5149                  Operand ns8 = context.AddIntrinsic(Intrinsic.X86Psrlw, n, Const(8));
5150                  Operand ms8 = context.AddIntrinsic(Intrinsic.X86Psrlw, m, Const(8));
5151  
5152                  res = context.AddIntrinsic(Intrinsic.X86Pmullw, ns8, ms8);
5153  
5154                  res = context.AddIntrinsic(Intrinsic.X86Psllw, res, Const(8));
5155  
5156                  Operand res2 = context.AddIntrinsic(Intrinsic.X86Pmullw, n, m);
5157  
5158                  Operand mask = X86GetAllElements(context, 0x00FF00FF);
5159  
5160                  res = context.AddIntrinsic(Intrinsic.X86Pblendvb, res, res2, mask);
5161              }
5162              else if (op.Size == 1)
5163              {
5164                  res = context.AddIntrinsic(Intrinsic.X86Pmullw, n, m);
5165              }
5166              else
5167              {
5168                  res = context.AddIntrinsic(Intrinsic.X86Pmulld, n, m);
5169              }
5170  
5171              Operand d = GetVec(op.Rd);
5172  
5173              if (addSub == AddSub.Add)
5174              {
5175                  Intrinsic addInst = X86PaddInstruction[op.Size];
5176  
5177                  res = context.AddIntrinsic(addInst, d, res);
5178              }
5179              else if (addSub == AddSub.Subtract)
5180              {
5181                  Intrinsic subInst = X86PsubInstruction[op.Size];
5182  
5183                  res = context.AddIntrinsic(subInst, d, res);
5184              }
5185  
5186              if (op.RegisterSize == RegisterSize.Simd64)
5187              {
5188                  res = context.VectorZeroUpper64(res);
5189              }
5190  
5191              context.Copy(d, res);
5192          }
5193  
5194          private static void EmitSse41VectorSabdOp(
5195              ArmEmitterContext context,
5196              OpCodeSimdReg op,
5197              Operand n,
5198              Operand m,
5199              bool isLong)
5200          {
5201              int size = isLong ? op.Size + 1 : op.Size;
5202  
5203              Intrinsic cmpgtInst = X86PcmpgtInstruction[size];
5204  
5205              Operand cmpMask = context.AddIntrinsic(cmpgtInst, n, m);
5206  
5207              Intrinsic subInst = X86PsubInstruction[size];
5208  
5209              Operand res = context.AddIntrinsic(subInst, n, m);
5210  
5211              res = context.AddIntrinsic(Intrinsic.X86Pand, cmpMask, res);
5212  
5213              Operand res2 = context.AddIntrinsic(subInst, m, n);
5214  
5215              res2 = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, res2);
5216  
5217              res = context.AddIntrinsic(Intrinsic.X86Por, res, res2);
5218  
5219              if (!isLong && op.RegisterSize == RegisterSize.Simd64)
5220              {
5221                  res = context.VectorZeroUpper64(res);
5222              }
5223  
5224              context.Copy(GetVec(op.Rd), res);
5225          }
5226  
5227          private static void EmitSse41VectorUabdOp(
5228              ArmEmitterContext context,
5229              OpCodeSimdReg op,
5230              Operand n,
5231              Operand m,
5232              bool isLong)
5233          {
5234              int size = isLong ? op.Size + 1 : op.Size;
5235  
5236              Intrinsic maxInst = X86PmaxuInstruction[size];
5237  
5238              Operand max = context.AddIntrinsic(maxInst, m, n);
5239  
5240              Intrinsic cmpeqInst = X86PcmpeqInstruction[size];
5241  
5242              Operand cmpMask = context.AddIntrinsic(cmpeqInst, max, m);
5243  
5244              Operand onesMask = X86GetAllElements(context, -1L);
5245  
5246              cmpMask = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, onesMask);
5247  
5248              Intrinsic subInst = X86PsubInstruction[size];
5249  
5250              Operand res = context.AddIntrinsic(subInst, n, m);
5251              Operand res2 = context.AddIntrinsic(subInst, m, n);
5252  
5253              res = context.AddIntrinsic(Intrinsic.X86Pand, cmpMask, res);
5254              res2 = context.AddIntrinsic(Intrinsic.X86Pandn, cmpMask, res2);
5255  
5256              res = context.AddIntrinsic(Intrinsic.X86Por, res, res2);
5257  
5258              if (!isLong && op.RegisterSize == RegisterSize.Simd64)
5259              {
5260                  res = context.VectorZeroUpper64(res);
5261              }
5262  
5263              context.Copy(GetVec(op.Rd), res);
5264          }
5265  
5266          private static Operand EmitSse2Sll_128(ArmEmitterContext context, Operand op, int shift)
5267          {
5268              // The upper part of op is assumed to be zero.
5269              Debug.Assert(shift >= 0 && shift < 64);
5270  
5271              if (shift == 0)
5272              {
5273                  return op;
5274              }
5275  
5276              Operand high = context.AddIntrinsic(Intrinsic.X86Pslldq, op, Const(8));
5277              high = context.AddIntrinsic(Intrinsic.X86Psrlq, high, Const(64 - shift));
5278  
5279              Operand low = context.AddIntrinsic(Intrinsic.X86Psllq, op, Const(shift));
5280  
5281              return context.AddIntrinsic(Intrinsic.X86Por, high, low);
5282          }
5283      }
5284  }