/ src / ARMeilleure / Instructions / InstEmitSimdMove32.cs
InstEmitSimdMove32.cs
  1  using ARMeilleure.Decoders;
  2  using ARMeilleure.IntermediateRepresentation;
  3  using ARMeilleure.Translation;
  4  using System;
  5  using static ARMeilleure.Instructions.InstEmitHelper;
  6  using static ARMeilleure.Instructions.InstEmitSimdHelper;
  7  using static ARMeilleure.Instructions.InstEmitSimdHelper32;
  8  using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
  9  
 10  namespace ARMeilleure.Instructions
 11  {
 12      static partial class InstEmit32
 13      {
 14          #region "Masks"
 15          // Same as InstEmitSimdMove, as the instructions do the same thing.
 16          private static readonly long[] _masksE0_Uzp = new long[]
 17          {
 18              13L << 56 | 09L << 48 | 05L << 40 | 01L << 32 | 12L << 24 | 08L << 16 | 04L << 8 | 00L << 0,
 19              11L << 56 | 10L << 48 | 03L << 40 | 02L << 32 | 09L << 24 | 08L << 16 | 01L << 8 | 00L << 0,
 20          };
 21  
 22          private static readonly long[] _masksE1_Uzp = new long[]
 23          {
 24              15L << 56 | 11L << 48 | 07L << 40 | 03L << 32 | 14L << 24 | 10L << 16 | 06L << 8 | 02L << 0,
 25              15L << 56 | 14L << 48 | 07L << 40 | 06L << 32 | 13L << 24 | 12L << 16 | 05L << 8 | 04L << 0,
 26          };
 27          #endregion
 28  
 29          public static void Vmov_I(ArmEmitterContext context)
 30          {
 31              EmitVectorImmUnaryOp32(context, (op1) => op1);
 32          }
 33  
 34          public static void Vmvn_I(ArmEmitterContext context)
 35          {
 36              if (Optimizations.UseAvx512Ortho)
 37              {
 38                  EmitVectorUnaryOpSimd32(context, (op1) =>
 39                  {
 40                      return context.AddIntrinsic(Intrinsic.X86Vpternlogd, op1, op1, Const(0b01010101));
 41                  });
 42              }
 43              else if (Optimizations.UseSse2)
 44              {
 45                  EmitVectorUnaryOpSimd32(context, (op1) =>
 46                  {
 47                      Operand mask = X86GetAllElements(context, -1L);
 48                      return context.AddIntrinsic(Intrinsic.X86Pandn, op1, mask);
 49                  });
 50              }
 51              else
 52              {
 53                  EmitVectorUnaryOpZx32(context, (op1) => context.BitwiseNot(op1));
 54              }
 55          }
 56  
 57          public static void Vmvn_II(ArmEmitterContext context)
 58          {
 59              EmitVectorImmUnaryOp32(context, (op1) => context.BitwiseNot(op1));
 60          }
 61  
 62          public static void Vmov_GS(ArmEmitterContext context)
 63          {
 64              OpCode32SimdMovGp op = (OpCode32SimdMovGp)context.CurrOp;
 65  
 66              Operand vec = GetVecA32(op.Vn >> 2);
 67              if (op.Op == 1)
 68              {
 69                  // To general purpose.
 70                  Operand value = context.VectorExtract(OperandType.I32, vec, op.Vn & 0x3);
 71                  SetIntA32(context, op.Rt, value);
 72              }
 73              else
 74              {
 75                  // From general purpose.
 76                  Operand value = GetIntA32(context, op.Rt);
 77                  context.Copy(vec, context.VectorInsert(vec, value, op.Vn & 0x3));
 78              }
 79          }
 80  
 81          public static void Vmov_G1(ArmEmitterContext context)
 82          {
 83              OpCode32SimdMovGpElem op = (OpCode32SimdMovGpElem)context.CurrOp;
 84  
 85              int index = op.Index + ((op.Vd & 1) << (3 - op.Size));
 86              if (op.Op == 1)
 87              {
 88                  // To general purpose.
 89                  Operand value = EmitVectorExtract32(context, op.Vd >> 1, index, op.Size, !op.U);
 90                  SetIntA32(context, op.Rt, value);
 91              }
 92              else
 93              {
 94                  // From general purpose.
 95                  Operand vec = GetVecA32(op.Vd >> 1);
 96                  Operand value = GetIntA32(context, op.Rt);
 97                  context.Copy(vec, EmitVectorInsert(context, vec, value, index, op.Size));
 98              }
 99          }
100  
101          public static void Vmov_G2(ArmEmitterContext context)
102          {
103              OpCode32SimdMovGpDouble op = (OpCode32SimdMovGpDouble)context.CurrOp;
104  
105              Operand vec = GetVecA32(op.Vm >> 2);
106              int vm1 = op.Vm + 1;
107              bool sameOwnerVec = (op.Vm >> 2) == (vm1 >> 2);
108              Operand vec2 = sameOwnerVec ? vec : GetVecA32(vm1 >> 2);
109              if (op.Op == 1)
110              {
111                  // To general purpose.
112                  Operand lowValue = context.VectorExtract(OperandType.I32, vec, op.Vm & 3);
113                  SetIntA32(context, op.Rt, lowValue);
114  
115                  Operand highValue = context.VectorExtract(OperandType.I32, vec2, vm1 & 3);
116                  SetIntA32(context, op.Rt2, highValue);
117              }
118              else
119              {
120                  // From general purpose.
121                  Operand lowValue = GetIntA32(context, op.Rt);
122                  Operand resultVec = context.VectorInsert(vec, lowValue, op.Vm & 3);
123  
124                  Operand highValue = GetIntA32(context, op.Rt2);
125  
126                  if (sameOwnerVec)
127                  {
128                      context.Copy(vec, context.VectorInsert(resultVec, highValue, vm1 & 3));
129                  }
130                  else
131                  {
132                      context.Copy(vec, resultVec);
133                      context.Copy(vec2, context.VectorInsert(vec2, highValue, vm1 & 3));
134                  }
135              }
136          }
137  
138          public static void Vmov_GD(ArmEmitterContext context)
139          {
140              OpCode32SimdMovGpDouble op = (OpCode32SimdMovGpDouble)context.CurrOp;
141  
142              Operand vec = GetVecA32(op.Vm >> 1);
143              if (op.Op == 1)
144              {
145                  // To general purpose.
146                  Operand value = context.VectorExtract(OperandType.I64, vec, op.Vm & 1);
147                  SetIntA32(context, op.Rt, context.ConvertI64ToI32(value));
148                  SetIntA32(context, op.Rt2, context.ConvertI64ToI32(context.ShiftRightUI(value, Const(32))));
149              }
150              else
151              {
152                  // From general purpose.
153                  Operand lowValue = GetIntA32(context, op.Rt);
154                  Operand highValue = GetIntA32(context, op.Rt2);
155  
156                  Operand value = context.BitwiseOr(
157                      context.ZeroExtend32(OperandType.I64, lowValue),
158                      context.ShiftLeft(context.ZeroExtend32(OperandType.I64, highValue), Const(32)));
159  
160                  context.Copy(vec, context.VectorInsert(vec, value, op.Vm & 1));
161              }
162          }
163  
164          public static void Vmovl(ArmEmitterContext context)
165          {
166              OpCode32SimdLong op = (OpCode32SimdLong)context.CurrOp;
167  
168              Operand res = context.VectorZero();
169  
170              int elems = op.GetBytesCount() >> op.Size;
171  
172              for (int index = 0; index < elems; index++)
173              {
174                  Operand me = EmitVectorExtract32(context, op.Qm, op.Im + index, op.Size, !op.U);
175  
176                  if (op.Size == 2)
177                  {
178                      if (op.U)
179                      {
180                          me = context.ZeroExtend32(OperandType.I64, me);
181                      }
182                      else
183                      {
184                          me = context.SignExtend32(OperandType.I64, me);
185                      }
186                  }
187  
188                  res = EmitVectorInsert(context, res, me, index, op.Size + 1);
189              }
190  
191              context.Copy(GetVecA32(op.Qd), res);
192          }
193  
194          public static void Vswp(ArmEmitterContext context)
195          {
196              OpCode32Simd op = (OpCode32Simd)context.CurrOp;
197  
198              if (op.Q)
199              {
200                  Operand temp = context.Copy(GetVecA32(op.Qd));
201  
202                  context.Copy(GetVecA32(op.Qd), GetVecA32(op.Qm));
203                  context.Copy(GetVecA32(op.Qm), temp);
204              }
205              else
206              {
207                  Operand temp = ExtractScalar(context, OperandType.I64, op.Vd);
208  
209                  InsertScalar(context, op.Vd, ExtractScalar(context, OperandType.I64, op.Vm));
210                  InsertScalar(context, op.Vm, temp);
211              }
212          }
213  
214          public static void Vtbl(ArmEmitterContext context)
215          {
216              OpCode32SimdTbl op = (OpCode32SimdTbl)context.CurrOp;
217  
218              bool extension = op.Opc == 1;
219              int length = op.Length + 1;
220  
221              if (Optimizations.UseSsse3)
222              {
223                  Operand d = GetVecA32(op.Qd);
224                  Operand m = EmitMoveDoubleWordToSide(context, GetVecA32(op.Qm), op.Vm, 0);
225  
226                  Operand res;
227                  Operand mask = X86GetAllElements(context, 0x0707070707070707L);
228  
229                  // Fast path for single register table.
230                  {
231                      Operand n = EmitMoveDoubleWordToSide(context, GetVecA32(op.Qn), op.Vn, 0);
232  
233                      Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, mask);
234                      mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, m);
235  
236                      res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mMask);
237                  }
238  
239                  for (int index = 1; index < length; index++)
240                  {
241                      int newVn = (op.Vn + index) & 0x1F;
242                      (int qn, _) = GetQuadwordAndSubindex(newVn, op.RegisterSize);
243                      Operand ni = EmitMoveDoubleWordToSide(context, GetVecA32(qn), newVn, 0);
244  
245                      Operand idxMask = X86GetAllElements(context, 0x0808080808080808L * index);
246  
247                      Operand mSubMask = context.AddIntrinsic(Intrinsic.X86Psubb, m, idxMask);
248  
249                      Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, mSubMask, mask);
250                      mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, mSubMask);
251  
252                      Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, ni, mMask);
253  
254                      res = context.AddIntrinsic(Intrinsic.X86Por, res, res2);
255                  }
256  
257                  if (extension)
258                  {
259                      Operand idxMask = X86GetAllElements(context, (0x0808080808080808L * length) - 0x0101010101010101L);
260                      Operand zeroMask = context.VectorZero();
261  
262                      Operand mPosMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, idxMask);
263                      Operand mNegMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, zeroMask, m);
264  
265                      Operand mMask = context.AddIntrinsic(Intrinsic.X86Por, mPosMask, mNegMask);
266  
267                      Operand dMask = context.AddIntrinsic(Intrinsic.X86Pand, EmitMoveDoubleWordToSide(context, d, op.Vd, 0), mMask);
268  
269                      res = context.AddIntrinsic(Intrinsic.X86Por, res, dMask);
270                  }
271  
272                  res = EmitMoveDoubleWordToSide(context, res, 0, op.Vd);
273  
274                  context.Copy(d, EmitDoubleWordInsert(context, d, res, op.Vd));
275              }
276              else
277              {
278                  int elems = op.GetBytesCount() >> op.Size;
279  
280                  (int Qx, int Ix)[] tableTuples = new (int, int)[length];
281                  for (int i = 0; i < length; i++)
282                  {
283                      tableTuples[i] = GetQuadwordAndSubindex(op.Vn + i, op.RegisterSize);
284                  }
285  
286                  int byteLength = length * 8;
287  
288                  Operand res = GetVecA32(op.Qd);
289                  Operand m = GetVecA32(op.Qm);
290  
291                  for (int index = 0; index < elems; index++)
292                  {
293                      Operand selectedIndex = context.ZeroExtend8(OperandType.I32, context.VectorExtract8(m, index + op.Im));
294  
295                      Operand inRange = context.ICompareLess(selectedIndex, Const(byteLength));
296                      Operand elemRes = default; // Note: This is I64 for ease of calculation.
297  
298                      // TODO: Branching rather than conditional select.
299  
300                      // Get indexed byte.
301                      // To simplify (ha) the il, we get bytes from every vector and use a nested conditional select to choose the right result.
302                      // This does have to extract `length` times for every element but certainly not as bad as it could be.
303  
304                      // Which vector number is the index on.
305                      Operand vecIndex = context.ShiftRightUI(selectedIndex, Const(3));
306                      // What should we shift by to extract it.
307                      Operand subVecIndexShift = context.ShiftLeft(context.BitwiseAnd(selectedIndex, Const(7)), Const(3));
308  
309                      for (int i = 0; i < length; i++)
310                      {
311                          (int qx, int ix) = tableTuples[i];
312                          // Get the whole vector, we'll get a byte out of it.
313                          Operand lookupResult;
314                          if (qx == op.Qd)
315                          {
316                              // Result contains the current state of the vector.
317                              lookupResult = context.VectorExtract(OperandType.I64, res, ix);
318                          }
319                          else
320                          {
321                              lookupResult = EmitVectorExtract32(context, qx, ix, 3, false); // I64
322                          }
323  
324                          lookupResult = context.ShiftRightUI(lookupResult, subVecIndexShift); // Get the relevant byte from this vector.
325  
326                          if (i == 0)
327                          {
328                              elemRes = lookupResult; // First result is always default.
329                          }
330                          else
331                          {
332                              Operand isThisElem = context.ICompareEqual(vecIndex, Const(i));
333                              elemRes = context.ConditionalSelect(isThisElem, lookupResult, elemRes);
334                          }
335                      }
336  
337                      Operand fallback = (extension) ? context.ZeroExtend32(OperandType.I64, EmitVectorExtract32(context, op.Qd, index + op.Id, 0, false)) : Const(0L);
338  
339                      res = EmitVectorInsert(context, res, context.ConditionalSelect(inRange, elemRes, fallback), index + op.Id, 0);
340                  }
341  
342                  context.Copy(GetVecA32(op.Qd), res);
343              }
344          }
345  
346          public static void Vtrn(ArmEmitterContext context)
347          {
348              OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
349  
350              if (Optimizations.UseSsse3)
351              {
352                  EmitVectorShuffleOpSimd32(context, (m, d) =>
353                  {
354                      Operand mask = default;
355  
356                      if (op.Size < 3)
357                      {
358                          long maskE0 = EvenMasks[op.Size];
359                          long maskE1 = OddMasks[op.Size];
360  
361                          mask = X86GetScalar(context, maskE0);
362  
363                          mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3);
364                      }
365  
366                      if (op.Size < 3)
367                      {
368                          d = context.AddIntrinsic(Intrinsic.X86Pshufb, d, mask);
369                          m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask);
370                      }
371  
372                      Operand resD = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m);
373                      Operand resM = context.AddIntrinsic(X86PunpckhInstruction[op.Size], d, m);
374  
375                      return (resM, resD);
376                  });
377              }
378              else
379              {
380                  int elems = op.GetBytesCount() >> op.Size;
381                  int pairs = elems >> 1;
382  
383                  bool overlap = op.Qm == op.Qd;
384  
385                  Operand resD = GetVecA32(op.Qd);
386                  Operand resM = GetVecA32(op.Qm);
387  
388                  for (int index = 0; index < pairs; index++)
389                  {
390                      int pairIndex = index << 1;
391                      Operand d2 = EmitVectorExtract32(context, op.Qd, pairIndex + 1 + op.Id, op.Size, false);
392                      Operand m1 = EmitVectorExtract32(context, op.Qm, pairIndex + op.Im, op.Size, false);
393  
394                      resD = EmitVectorInsert(context, resD, m1, pairIndex + 1 + op.Id, op.Size);
395  
396                      if (overlap)
397                      {
398                          resM = resD;
399                      }
400  
401                      resM = EmitVectorInsert(context, resM, d2, pairIndex + op.Im, op.Size);
402  
403                      if (overlap)
404                      {
405                          resD = resM;
406                      }
407                  }
408  
409                  context.Copy(GetVecA32(op.Qd), resD);
410                  if (!overlap)
411                  {
412                      context.Copy(GetVecA32(op.Qm), resM);
413                  }
414              }
415          }
416  
417          public static void Vzip(ArmEmitterContext context)
418          {
419              OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
420  
421              if (Optimizations.UseAdvSimd)
422              {
423                  EmitVectorZipUzpOpSimd32(context, Intrinsic.Arm64Zip1V, Intrinsic.Arm64Zip2V);
424              }
425              else if (Optimizations.UseSse2)
426              {
427                  EmitVectorShuffleOpSimd32(context, (m, d) =>
428                  {
429                      if (op.RegisterSize == RegisterSize.Simd128)
430                      {
431                          Operand resD = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m);
432                          Operand resM = context.AddIntrinsic(X86PunpckhInstruction[op.Size], d, m);
433  
434                          return (resM, resD);
435                      }
436                      else
437                      {
438                          Operand res = context.AddIntrinsic(X86PunpcklInstruction[op.Size], d, m);
439  
440                          Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, res, context.VectorZero());
441                          Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, res, context.VectorZero());
442                          return (resM, resD);
443                      }
444                  });
445              }
446              else
447              {
448                  int elems = op.GetBytesCount() >> op.Size;
449                  int pairs = elems >> 1;
450  
451                  bool overlap = op.Qm == op.Qd;
452  
453                  Operand resD = GetVecA32(op.Qd);
454                  Operand resM = GetVecA32(op.Qm);
455  
456                  for (int index = 0; index < pairs; index++)
457                  {
458                      int pairIndex = index << 1;
459                      Operand dRowD = EmitVectorExtract32(context, op.Qd, index + op.Id, op.Size, false);
460                      Operand mRowD = EmitVectorExtract32(context, op.Qm, index + op.Im, op.Size, false);
461  
462                      Operand dRowM = EmitVectorExtract32(context, op.Qd, index + op.Id + pairs, op.Size, false);
463                      Operand mRowM = EmitVectorExtract32(context, op.Qm, index + op.Im + pairs, op.Size, false);
464  
465                      resD = EmitVectorInsert(context, resD, dRowD, pairIndex + op.Id, op.Size);
466                      resD = EmitVectorInsert(context, resD, mRowD, pairIndex + 1 + op.Id, op.Size);
467  
468                      if (overlap)
469                      {
470                          resM = resD;
471                      }
472  
473                      resM = EmitVectorInsert(context, resM, dRowM, pairIndex + op.Im, op.Size);
474                      resM = EmitVectorInsert(context, resM, mRowM, pairIndex + 1 + op.Im, op.Size);
475  
476                      if (overlap)
477                      {
478                          resD = resM;
479                      }
480                  }
481  
482                  context.Copy(GetVecA32(op.Qd), resD);
483                  if (!overlap)
484                  {
485                      context.Copy(GetVecA32(op.Qm), resM);
486                  }
487              }
488          }
489  
490          public static void Vuzp(ArmEmitterContext context)
491          {
492              OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
493  
494              if (Optimizations.UseAdvSimd)
495              {
496                  EmitVectorZipUzpOpSimd32(context, Intrinsic.Arm64Uzp1V, Intrinsic.Arm64Uzp2V);
497              }
498              else if (Optimizations.UseSsse3)
499              {
500                  EmitVectorShuffleOpSimd32(context, (m, d) =>
501                  {
502                      if (op.RegisterSize == RegisterSize.Simd128)
503                      {
504                          Operand mask = default;
505  
506                          if (op.Size < 3)
507                          {
508                              long maskE0 = EvenMasks[op.Size];
509                              long maskE1 = OddMasks[op.Size];
510  
511                              mask = X86GetScalar(context, maskE0);
512                              mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3);
513  
514                              d = context.AddIntrinsic(Intrinsic.X86Pshufb, d, mask);
515                              m = context.AddIntrinsic(Intrinsic.X86Pshufb, m, mask);
516                          }
517  
518                          Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, d, m);
519                          Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, d, m);
520  
521                          return (resM, resD);
522                      }
523                      else
524                      {
525                          Intrinsic punpcklInst = X86PunpcklInstruction[op.Size];
526  
527                          Operand res = context.AddIntrinsic(punpcklInst, d, m);
528  
529                          if (op.Size < 2)
530                          {
531                              long maskE0 = _masksE0_Uzp[op.Size];
532                              long maskE1 = _masksE1_Uzp[op.Size];
533  
534                              Operand mask = X86GetScalar(context, maskE0);
535  
536                              mask = EmitVectorInsert(context, mask, Const(maskE1), 1, 3);
537  
538                              res = context.AddIntrinsic(Intrinsic.X86Pshufb, res, mask);
539                          }
540  
541                          Operand resD = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, res, context.VectorZero());
542                          Operand resM = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, res, context.VectorZero());
543  
544                          return (resM, resD);
545                      }
546                  });
547              }
548              else
549              {
550                  int elems = op.GetBytesCount() >> op.Size;
551                  int pairs = elems >> 1;
552  
553                  bool overlap = op.Qm == op.Qd;
554  
555                  Operand resD = GetVecA32(op.Qd);
556                  Operand resM = GetVecA32(op.Qm);
557  
558                  for (int index = 0; index < elems; index++)
559                  {
560                      Operand dIns, mIns;
561                      if (index >= pairs)
562                      {
563                          int pairIndex = index - pairs;
564                          dIns = EmitVectorExtract32(context, op.Qm, (pairIndex << 1) + op.Im, op.Size, false);
565                          mIns = EmitVectorExtract32(context, op.Qm, ((pairIndex << 1) | 1) + op.Im, op.Size, false);
566                      }
567                      else
568                      {
569                          dIns = EmitVectorExtract32(context, op.Qd, (index << 1) + op.Id, op.Size, false);
570                          mIns = EmitVectorExtract32(context, op.Qd, ((index << 1) | 1) + op.Id, op.Size, false);
571                      }
572  
573                      resD = EmitVectorInsert(context, resD, dIns, index + op.Id, op.Size);
574  
575                      if (overlap)
576                      {
577                          resM = resD;
578                      }
579  
580                      resM = EmitVectorInsert(context, resM, mIns, index + op.Im, op.Size);
581  
582                      if (overlap)
583                      {
584                          resD = resM;
585                      }
586                  }
587  
588                  context.Copy(GetVecA32(op.Qd), resD);
589                  if (!overlap)
590                  {
591                      context.Copy(GetVecA32(op.Qm), resM);
592                  }
593              }
594          }
595  
596          private static void EmitVectorZipUzpOpSimd32(ArmEmitterContext context, Intrinsic inst1, Intrinsic inst2)
597          {
598              OpCode32SimdCmpZ op = (OpCode32SimdCmpZ)context.CurrOp;
599  
600              bool overlap = op.Qm == op.Qd;
601  
602              Operand d = GetVecA32(op.Qd);
603              Operand m = GetVecA32(op.Qm);
604  
605              Operand dPart = d;
606              Operand mPart = m;
607  
608              if (!op.Q) // Register swap: move relevant doubleword to destination side.
609              {
610                  dPart = InstEmitSimdHelper32Arm64.EmitMoveDoubleWordToSide(context, d, op.Vd, 0);
611                  mPart = InstEmitSimdHelper32Arm64.EmitMoveDoubleWordToSide(context, m, op.Vm, 0);
612              }
613  
614              Intrinsic vSize = op.Q ? Intrinsic.Arm64V128 : Intrinsic.Arm64V64;
615  
616              vSize |= (Intrinsic)(op.Size << (int)Intrinsic.Arm64VSizeShift);
617  
618              Operand resD = context.AddIntrinsic(inst1 | vSize, dPart, mPart);
619              Operand resM = context.AddIntrinsic(inst2 | vSize, dPart, mPart);
620  
621              if (!op.Q) // Register insert.
622              {
623                  resD = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, d, Const(op.Vd & 1), resD, Const(0));
624  
625                  if (overlap)
626                  {
627                      resD = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, resD, Const(op.Vm & 1), resM, Const(0));
628                  }
629                  else
630                  {
631                      resM = context.AddIntrinsic(Intrinsic.Arm64InsVe | Intrinsic.Arm64VDWord, m, Const(op.Vm & 1), resM, Const(0));
632                  }
633              }
634  
635              context.Copy(d, resD);
636              if (!overlap)
637              {
638                  context.Copy(m, resM);
639              }
640          }
641  
642          private static void EmitVectorShuffleOpSimd32(ArmEmitterContext context, Func<Operand, Operand, (Operand, Operand)> shuffleFunc)
643          {
644              OpCode32Simd op = (OpCode32Simd)context.CurrOp;
645  
646              Operand m = GetVecA32(op.Qm);
647              Operand d = GetVecA32(op.Qd);
648              Operand initialM = m;
649              Operand initialD = d;
650  
651              if (!op.Q) // Register swap: move relevant doubleword to side 0, for consistency.
652              {
653                  m = EmitMoveDoubleWordToSide(context, m, op.Vm, 0);
654                  d = EmitMoveDoubleWordToSide(context, d, op.Vd, 0);
655              }
656  
657              (Operand resM, Operand resD) = shuffleFunc(m, d);
658  
659              bool overlap = op.Qm == op.Qd;
660  
661              if (!op.Q) // Register insert.
662              {
663                  resM = EmitDoubleWordInsert(context, initialM, EmitMoveDoubleWordToSide(context, resM, 0, op.Vm), op.Vm);
664                  resD = EmitDoubleWordInsert(context, overlap ? resM : initialD, EmitMoveDoubleWordToSide(context, resD, 0, op.Vd), op.Vd);
665              }
666  
667              if (!overlap)
668              {
669                  context.Copy(initialM, resM);
670              }
671  
672              context.Copy(initialD, resD);
673          }
674      }
675  }