Convolve.cs
  1  using Ryujinx.Common.Memory;
  2  using Ryujinx.Graphics.Nvdec.Vp9.Common;
  3  using System.Diagnostics;
  4  using System.Runtime.CompilerServices;
  5  using System.Runtime.Intrinsics;
  6  using System.Runtime.Intrinsics.X86;
  7  using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
  8  
  9  namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 10  {
 11      internal static class Convolve
 12      {
        // Compile-time switch for the SIMD code paths: ConvolveHoriz and ConvolveVert only take
        // their SSE4.1 / AVX2 routes when this is true and the CPU reports support.
        private const bool UseIntrinsics = true;
 14  
 15          [MethodImpl(MethodImplOptions.AggressiveInlining)]
 16          private static Vector128<int> MultiplyAddAdjacent(
 17              Vector128<short> vsrc0,
 18              Vector128<short> vsrc1,
 19              Vector128<short> vsrc2,
 20              Vector128<short> vsrc3,
 21              Vector128<short> vfilter,
 22              Vector128<int> zero)
 23          {
 24              // < sumN, sumN, sumN, sumN >
 25              Vector128<int> sum0 = Sse2.MultiplyAddAdjacent(vsrc0, vfilter);
 26              Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(vsrc1, vfilter);
 27              Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(vsrc2, vfilter);
 28              Vector128<int> sum3 = Sse2.MultiplyAddAdjacent(vsrc3, vfilter);
 29  
 30              // < 0, 0, sumN, sumN >
 31              sum0 = Ssse3.HorizontalAdd(sum0, zero);
 32              sum1 = Ssse3.HorizontalAdd(sum1, zero);
 33              sum2 = Ssse3.HorizontalAdd(sum2, zero);
 34              sum3 = Ssse3.HorizontalAdd(sum3, zero);
 35  
 36              // < 0, 0, 0, sumN >
 37              sum0 = Ssse3.HorizontalAdd(sum0, zero);
 38              sum1 = Ssse3.HorizontalAdd(sum1, zero);
 39              sum2 = Ssse3.HorizontalAdd(sum2, zero);
 40              sum3 = Ssse3.HorizontalAdd(sum3, zero);
 41  
 42              // < 0, 0, sum1, sum0 >
 43              Vector128<int> sum01 = Sse2.UnpackLow(sum0, sum1);
 44  
 45              // < 0, 0, sum3, sum2 >
 46              Vector128<int> sum23 = Sse2.UnpackLow(sum2, sum3);
 47  
 48              // < sum3, sum2, sum1, sum0 >
 49              return Sse.MoveLowToHigh(sum01.AsSingle(), sum23.AsSingle()).AsInt32();
 50          }
 51  
 52          [MethodImpl(MethodImplOptions.AggressiveInlining)]
 53          private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
 54          {
 55              return Sse2.ShiftRightArithmetic(Sse2.Add(value, const64), FilterBits);
 56          }
 57  
 58          [MethodImpl(MethodImplOptions.AggressiveInlining)]
 59          private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
 60          {
 61              return Sse2.PackUnsignedSaturate(Sse41.PackUnsignedSaturate(value, zero).AsInt16(), zero.AsInt16());
 62          }
 63  
 64          [MethodImpl(MethodImplOptions.AggressiveInlining)]
 65          private static unsafe void ConvolveHorizSse41(
 66              byte* src,
 67              int srcStride,
 68              byte* dst,
 69              int dstStride,
 70              Array8<short>[] xFilters,
 71              int x0Q4,
 72              int w,
 73              int h)
 74          {
 75              Vector128<int> zero = Vector128<int>.Zero;
 76              Vector128<int> const64 = Vector128.Create(64);
 77  
 78              ulong x, y;
 79              src -= SubpelTaps / 2 - 1;
 80  
 81              fixed (Array8<short>* xFilter = xFilters)
 82              {
 83                  Vector128<short> vfilter = Sse2.LoadVector128((short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8);
 84  
 85                  for (y = 0; y < (uint)h; ++y)
 86                  {
 87                      ulong srcOffset = (uint)x0Q4 >> SubpelBits;
 88                      for (x = 0; x < (uint)w; x += 4)
 89                      {
 90                          Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(&src[srcOffset + x]);
 91                          Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 1]);
 92                          Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 2]);
 93                          Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 3]);
 94  
 95                          Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
 96  
 97                          Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
 98                      }
 99                      src += srcStride;
100                      dst += dstStride;
101                  }
102              }
103          }
104  
105          [MethodImpl(MethodImplOptions.AggressiveInlining)]
106          private static unsafe void ConvolveHoriz(
107              byte* src,
108              int srcStride,
109              byte* dst,
110              int dstStride,
111              Array8<short>[] xFilters,
112              int x0Q4,
113              int xStepQ4,
114              int w,
115              int h)
116          {
117              if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits)
118              {
119                  ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);
120  
121                  return;
122              }
123  
124              int x, y;
125              src -= SubpelTaps / 2 - 1;
126  
127              for (y = 0; y < h; ++y)
128              {
129                  int xQ4 = x0Q4;
130                  for (x = 0; x < w; ++x)
131                  {
132                      byte* srcX = &src[xQ4 >> SubpelBits];
133                      ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
134                      int k, sum = 0;
135                      for (k = 0; k < SubpelTaps; ++k)
136                      {
137                          sum += srcX[k] * xFilter[k];
138                      }
139  
140                      dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
141                      xQ4 += xStepQ4;
142                  }
143                  src += srcStride;
144                  dst += dstStride;
145              }
146          }
147  
148          private static unsafe void ConvolveAvgHoriz(
149              byte* src,
150              int srcStride,
151              byte* dst,
152              int dstStride,
153              Array8<short>[] xFilters,
154              int x0Q4,
155              int xStepQ4,
156              int w,
157              int h)
158          {
159              int x, y;
160              src -= SubpelTaps / 2 - 1;
161  
162              for (y = 0; y < h; ++y)
163              {
164                  int xQ4 = x0Q4;
165                  for (x = 0; x < w; ++x)
166                  {
167                      byte* srcX = &src[xQ4 >> SubpelBits];
168                      ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
169                      int k, sum = 0;
170                      for (k = 0; k < SubpelTaps; ++k)
171                      {
172                          sum += srcX[k] * xFilter[k];
173                      }
174  
175                      dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
176                      xQ4 += xStepQ4;
177                  }
178                  src += srcStride;
179                  dst += dstStride;
180              }
181          }
182  
        /// <summary>
        /// SIMD vertical filter for the unscaled case. A single filter row, selected by
        /// <c>y0Q4 &amp; SubpelMask</c>, is applied to every output pixel; four output bytes
        /// (four adjacent columns) are produced and stored per inner iteration.
        /// </summary>
        /// <remarks>
        /// Each inner iteration gathers 4 bytes from each of 8 consecutive source rows and writes
        /// 4 bytes to <paramref name="dst"/>, so the loop advances in steps of four columns.
        /// The source offset is computed with <paramref name="srcStride"/> cast to unsigned.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void ConvolveVertAvx2(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int w,
            int h)
        {
            Vector128<int> zero = Vector128<int>.Zero;
            Vector128<int> const64 = Vector128.Create(64);

            // Byte offsets of the 8 rows covered by the filter window, relative to the first row.
            Vector256<int> indices = Vector256.Create(
                0,
                srcStride,
                srcStride * 2,
                srcStride * 3,
                srcStride * 4,
                srcStride * 5,
                srcStride * 6,
                srcStride * 7);

            ulong x, y;

            // Start the filter window SubpelTaps / 2 - 1 rows above the output position.
            src -= srcStride * (SubpelTaps / 2 - 1);

            fixed (Array8<short>* yFilter = yFilters)
            {
                // Sub-pixel part of the start position picks the filter row (8 shorts per row).
                Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 8);

                // Whole-pixel part of the start position.
                ulong srcBaseY = (uint)y0Q4 >> SubpelBits;
                for (y = 0; y < (uint)h; ++y)
                {
                    ulong srcOffset = (srcBaseY + y) * (uint)srcStride;
                    for (x = 0; x < (uint)w; x += 4)
                    {
                        // One 32-bit element per row (scale 1 => indices are byte offsets):
                        // element k holds columns x..x+3 of row k.
                        Vector256<int> vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32();

                        // Rows 0-3 and rows 4-7.
                        Vector128<int> vsrcL = vsrc.GetLower();
                        Vector128<int> vsrcH = vsrc.GetUpper();

                        // Three rounds of byte interleaving transpose the 8-row x 4-column block so that
                        // the 8 samples of each column end up contiguous.
                        Vector128<byte> vsrcUnpck11 = Sse2.UnpackLow(vsrcL.AsByte(), vsrcH.AsByte());
                        Vector128<byte> vsrcUnpck12 = Sse2.UnpackHigh(vsrcL.AsByte(), vsrcH.AsByte());

                        Vector128<byte> vsrcUnpck21 = Sse2.UnpackLow(vsrcUnpck11, vsrcUnpck12);
                        Vector128<byte> vsrcUnpck22 = Sse2.UnpackHigh(vsrcUnpck11, vsrcUnpck12);

                        // vsrc01 = column 0 (rows 0-7) followed by column 1; vsrc23 = columns 2 and 3.
                        Vector128<byte> vsrc01 = Sse2.UnpackLow(vsrcUnpck21, vsrcUnpck22);
                        Vector128<byte> vsrc23 = Sse2.UnpackHigh(vsrcUnpck21, vsrcUnpck22);

                        // Bring the odd columns (upper 8 bytes) down into the low half.
                        Vector128<byte> vsrc11 = Sse.MoveHighToLow(vsrc01.AsSingle(), vsrc01.AsSingle()).AsByte();
                        Vector128<byte> vsrc33 = Sse.MoveHighToLow(vsrc23.AsSingle(), vsrc23.AsSingle()).AsByte();

                        // Zero-extend the low 8 bytes (one column each) to 16-bit lanes.
                        Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(vsrc01);
                        Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(vsrc11);
                        Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(vsrc23);
                        Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(vsrc33);

                        Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);

                        // The low 32 bits hold the four result bytes; store them as one scalar.
                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
                    }
                    dst += dstStride;
                }
            }
        }
249  
250          [MethodImpl(MethodImplOptions.AggressiveInlining)]
251          private static unsafe void ConvolveVert(
252              byte* src,
253              int srcStride,
254              byte* dst,
255              int dstStride,
256              Array8<short>[] yFilters,
257              int y0Q4,
258              int yStepQ4,
259              int w,
260              int h)
261          {
262              if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits)
263              {
264                  ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);
265  
266                  return;
267              }
268  
269              int x, y;
270              src -= srcStride * (SubpelTaps / 2 - 1);
271  
272              for (x = 0; x < w; ++x)
273              {
274                  int yQ4 = y0Q4;
275                  for (y = 0; y < h; ++y)
276                  {
277                      byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
278                      ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
279                      int k, sum = 0;
280                      for (k = 0; k < SubpelTaps; ++k)
281                      {
282                          sum += srcY[k * srcStride] * yFilter[k];
283                      }
284  
285                      dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
286                      yQ4 += yStepQ4;
287                  }
288                  ++src;
289                  ++dst;
290              }
291          }
292  
293          private static unsafe void ConvolveAvgVert(
294              byte* src,
295              int srcStride,
296              byte* dst,
297              int dstStride,
298              Array8<short>[] yFilters,
299              int y0Q4,
300              int yStepQ4,
301              int w,
302              int h)
303          {
304              int x, y;
305              src -= srcStride * (SubpelTaps / 2 - 1);
306  
307              for (x = 0; x < w; ++x)
308              {
309                  int yQ4 = y0Q4;
310                  for (y = 0; y < h; ++y)
311                  {
312                      byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
313                      ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
314                      int k, sum = 0;
315                      for (k = 0; k < SubpelTaps; ++k)
316                      {
317                          sum += srcY[k * srcStride] * yFilter[k];
318                      }
319  
320                      dst[y * dstStride] = (byte)BitUtils.RoundPowerOfTwo(
321                          dst[y * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
322                      yQ4 += yStepQ4;
323                  }
324                  ++src;
325                  ++dst;
326              }
327          }
328  
329          public static unsafe void Convolve8Horiz(
330              byte* src,
331              int srcStride,
332              byte* dst,
333              int dstStride,
334              Array8<short>[] filter,
335              int x0Q4,
336              int xStepQ4,
337              int y0Q4,
338              int yStepQ4,
339              int w,
340              int h)
341          {
342              ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
343          }
344  
345          public static unsafe void Convolve8AvgHoriz(
346              byte* src,
347              int srcStride,
348              byte* dst,
349              int dstStride,
350              Array8<short>[] filter,
351              int x0Q4,
352              int xStepQ4,
353              int y0Q4,
354              int yStepQ4,
355              int w,
356              int h)
357          {
358              ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
359          }
360  
361          public static unsafe void Convolve8Vert(
362              byte* src,
363              int srcStride,
364              byte* dst,
365              int dstStride,
366              Array8<short>[] filter,
367              int x0Q4,
368              int xStepQ4,
369              int y0Q4,
370              int yStepQ4,
371              int w,
372              int h)
373          {
374              ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
375          }
376  
377          public static unsafe void Convolve8AvgVert(
378              byte* src,
379              int srcStride,
380              byte* dst,
381              int dstStride,
382              Array8<short>[] filter,
383              int x0Q4,
384              int xStepQ4,
385              int y0Q4,
386              int yStepQ4,
387              int w,
388              int h)
389          {
390              ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
391          }
392  
393          [SkipLocalsInit]
394          public static unsafe void Convolve8(
395              byte* src,
396              int srcStride,
397              byte* dst,
398              int dstStride,
399              Array8<short>[] filter,
400              int x0Q4,
401              int xStepQ4,
402              int y0Q4,
403              int yStepQ4,
404              int w,
405              int h)
406          {
407              // Note: Fixed size intermediate buffer, temp, places limits on parameters.
408              // 2d filtering proceeds in 2 steps:
409              //   (1) Interpolate horizontally into an intermediate buffer, temp.
410              //   (2) Interpolate temp vertically to derive the sub-pixel result.
411              // Deriving the maximum number of rows in the temp buffer (135):
412              // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
413              // --Largest block size is 64x64 pixels.
414              // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
415              //   original frame (in 1/16th pixel units).
416              // --Must round-up because block may be located at sub-pixel position.
417              // --Require an additional SubpelTaps rows for the 8-tap filter tails.
418              // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
419              // When calling in frame scaling function, the smallest scaling factor is x1/4
420              // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
421              // big enough.
422              byte* temp = stackalloc byte[64 * 135];
423              int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
424  
425              Debug.Assert(w <= 64);
426              Debug.Assert(h <= 64);
427              Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
428              Debug.Assert(xStepQ4 <= 64);
429  
430              ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight);
431              ConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
432          }
433  
434          public static unsafe void Convolve8Avg(
435              byte* src,
436              int srcStride,
437              byte* dst,
438              int dstStride,
439              Array8<short>[] filter,
440              int x0Q4,
441              int xStepQ4,
442              int y0Q4,
443              int yStepQ4,
444              int w,
445              int h)
446          {
447              // Fixed size intermediate buffer places limits on parameters.
448              byte* temp = stackalloc byte[64 * 64];
449              Debug.Assert(w <= 64);
450              Debug.Assert(h <= 64);
451  
452              Convolve8(src, srcStride, temp, 64, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
453              ConvolveAvg(temp, 64, dst, dstStride, null, 0, 0, 0, 0, w, h);
454          }
455  
456          public static unsafe void ConvolveCopy(
457              byte* src,
458              int srcStride,
459              byte* dst,
460              int dstStride,
461              Array8<short>[] filter,
462              int x0Q4,
463              int xStepQ4,
464              int y0Q4,
465              int yStepQ4,
466              int w,
467              int h)
468          {
469              int r;
470  
471              for (r = h; r > 0; --r)
472              {
473                  MemoryUtil.Copy(dst, src, w);
474                  src += srcStride;
475                  dst += dstStride;
476              }
477          }
478  
479          public static unsafe void ConvolveAvg(
480              byte* src,
481              int srcStride,
482              byte* dst,
483              int dstStride,
484              Array8<short>[] filter,
485              int x0Q4,
486              int xStepQ4,
487              int y0Q4,
488              int yStepQ4,
489              int w,
490              int h)
491          {
492              int x, y;
493  
494              for (y = 0; y < h; ++y)
495              {
496                  for (x = 0; x < w; ++x)
497                  {
498                      dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
499                  }
500  
501                  src += srcStride;
502                  dst += dstStride;
503              }
504          }
505  
506          public static unsafe void ScaledHoriz(
507              byte* src,
508              int srcStride,
509              byte* dst,
510              int dstStride,
511              Array8<short>[] filter,
512              int x0Q4,
513              int xStepQ4,
514              int y0Q4,
515              int yStepQ4,
516              int w,
517              int h)
518          {
519              Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
520          }
521  
522          public static unsafe void ScaledVert(
523              byte* src,
524              int srcStride,
525              byte* dst,
526              int dstStride,
527              Array8<short>[] filter,
528              int x0Q4,
529              int xStepQ4,
530              int y0Q4,
531              int yStepQ4,
532              int w,
533              int h)
534          {
535              Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
536          }
537  
538          public static unsafe void Scaled2D(
539              byte* src,
540              int srcStride,
541              byte* dst,
542              int dstStride,
543              Array8<short>[] filter,
544              int x0Q4,
545              int xStepQ4,
546              int y0Q4,
547              int yStepQ4,
548              int w,
549              int h)
550          {
551              Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
552          }
553  
554          public static unsafe void ScaledAvgHoriz(
555              byte* src,
556              int srcStride,
557              byte* dst,
558              int dstStride,
559              Array8<short>[] filter,
560              int x0Q4,
561              int xStepQ4,
562              int y0Q4,
563              int yStepQ4,
564              int w,
565              int h)
566          {
567              Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
568          }
569  
570          public static unsafe void ScaledAvgVert(
571              byte* src,
572              int srcStride,
573              byte* dst,
574              int dstStride,
575              Array8<short>[] filter,
576              int x0Q4,
577              int xStepQ4,
578              int y0Q4,
579              int yStepQ4,
580              int w,
581              int h)
582          {
583              Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
584          }
585  
586          public static unsafe void ScaledAvg2D(
587              byte* src,
588              int srcStride,
589              byte* dst,
590              int dstStride,
591              Array8<short>[] filter,
592              int x0Q4,
593              int xStepQ4,
594              int y0Q4,
595              int yStepQ4,
596              int w,
597              int h)
598          {
599              Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
600          }
601  
602          private static unsafe void HighbdConvolveHoriz(
603              ushort* src,
604              int srcStride,
605              ushort* dst,
606              int dstStride,
607              Array8<short>[] xFilters,
608              int x0Q4,
609              int xStepQ4,
610              int w,
611              int h,
612              int bd)
613          {
614              int x, y;
615              src -= SubpelTaps / 2 - 1;
616  
617              for (y = 0; y < h; ++y)
618              {
619                  int xQ4 = x0Q4;
620                  for (x = 0; x < w; ++x)
621                  {
622                      ushort* srcX = &src[xQ4 >> SubpelBits];
623                      ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
624                      int k, sum = 0;
625                      for (k = 0; k < SubpelTaps; ++k)
626                      {
627                          sum += srcX[k] * xFilter[k];
628                      }
629  
630                      dst[x] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
631                      xQ4 += xStepQ4;
632                  }
633                  src += srcStride;
634                  dst += dstStride;
635              }
636          }
637  
638          private static unsafe void HighbdConvolveAvgHoriz(
639              ushort* src,
640              int srcStride,
641              ushort* dst,
642              int dstStride,
643              Array8<short>[] xFilters,
644              int x0Q4,
645              int xStepQ4,
646              int w,
647              int h,
648              int bd)
649          {
650              int x, y;
651              src -= SubpelTaps / 2 - 1;
652  
653              for (y = 0; y < h; ++y)
654              {
655                  int xQ4 = x0Q4;
656                  for (x = 0; x < w; ++x)
657                  {
658                      ushort* srcX = &src[xQ4 >> SubpelBits];
659                      ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
660                      int k, sum = 0;
661                      for (k = 0; k < SubpelTaps; ++k)
662                      {
663                          sum += srcX[k] * xFilter[k];
664                      }
665  
666                      dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
667                      xQ4 += xStepQ4;
668                  }
669                  src += srcStride;
670                  dst += dstStride;
671              }
672          }
673  
674          private static unsafe void HighbdConvolveVert(
675              ushort* src,
676              int srcStride,
677              ushort* dst,
678              int dstStride,
679              Array8<short>[] yFilters,
680              int y0Q4,
681              int yStepQ4,
682              int w,
683              int h,
684              int bd)
685          {
686              int x, y;
687              src -= srcStride * (SubpelTaps / 2 - 1);
688  
689              for (x = 0; x < w; ++x)
690              {
691                  int yQ4 = y0Q4;
692                  for (y = 0; y < h; ++y)
693                  {
694                      ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
695                      ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
696                      int k, sum = 0;
697                      for (k = 0; k < SubpelTaps; ++k)
698                      {
699                          sum += srcY[k * srcStride] * yFilter[k];
700                      }
701  
702                      dst[y * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
703                      yQ4 += yStepQ4;
704                  }
705                  ++src;
706                  ++dst;
707              }
708          }
709  
710          private static unsafe void HighConvolveAvgVert(
711              ushort* src,
712              int srcStride,
713              ushort* dst,
714              int dstStride,
715              Array8<short>[] yFilters,
716              int y0Q4,
717              int yStepQ4,
718              int w,
719              int h,
720              int bd)
721          {
722              int x, y;
723              src -= srcStride * (SubpelTaps / 2 - 1);
724  
725              for (x = 0; x < w; ++x)
726              {
727                  int yQ4 = y0Q4;
728                  for (y = 0; y < h; ++y)
729                  {
730                      ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
731                      ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
732                      int k, sum = 0;
733                      for (k = 0; k < SubpelTaps; ++k)
734                      {
735                          sum += srcY[k * srcStride] * yFilter[k];
736                      }
737  
738                      dst[y * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(
739                          dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
740                      yQ4 += yStepQ4;
741                  }
742                  ++src;
743                  ++dst;
744              }
745          }
746  
747          private static unsafe void HighbdConvolve(
748              ushort* src,
749              int srcStride,
750              ushort* dst,
751              int dstStride,
752              Array8<short>[] filter,
753              int x0Q4,
754              int xStepQ4,
755              int y0Q4,
756              int yStepQ4,
757              int w,
758              int h,
759              int bd)
760          {
761              // Note: Fixed size intermediate buffer, temp, places limits on parameters.
762              // 2d filtering proceeds in 2 steps:
763              //   (1) Interpolate horizontally into an intermediate buffer, temp.
764              //   (2) Interpolate temp vertically to derive the sub-pixel result.
765              // Deriving the maximum number of rows in the temp buffer (135):
766              // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
767              // --Largest block size is 64x64 pixels.
768              // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
769              //   original frame (in 1/16th pixel units).
770              // --Must round-up because block may be located at sub-pixel position.
771              // --Require an additional SubpelTaps rows for the 8-tap filter tails.
772              // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
773              ushort* temp = stackalloc ushort[64 * 135];
774              int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
775  
776              Debug.Assert(w <= 64);
777              Debug.Assert(h <= 64);
778              Debug.Assert(yStepQ4 <= 32);
779              Debug.Assert(xStepQ4 <= 32);
780  
781              HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
782              HighbdConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
783          }
784  
785          public static unsafe void HighbdConvolve8Horiz(
786              ushort* src,
787              int srcStride,
788              ushort* dst,
789              int dstStride,
790              Array8<short>[] filter,
791              int x0Q4,
792              int xStepQ4,
793              int y0Q4,
794              int yStepQ4,
795              int w,
796              int h,
797              int bd)
798          {
799              HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
800          }
801  
802          public static unsafe void HighbdConvolve8AvgHoriz(
803              ushort* src,
804              int srcStride,
805              ushort* dst,
806              int dstStride,
807              Array8<short>[] filter,
808              int x0Q4,
809              int xStepQ4,
810              int y0Q4,
811              int yStepQ4,
812              int w,
813              int h,
814              int bd)
815          {
816              HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
817          }
818  
819          public static unsafe void HighbdConvolve8Vert(
820              ushort* src,
821              int srcStride,
822              ushort* dst,
823              int dstStride,
824              Array8<short>[] filter,
825              int x0Q4,
826              int xStepQ4,
827              int y0Q4,
828              int yStepQ4,
829              int w,
830              int h,
831              int bd)
832          {
833              HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
834          }
835  
836          public static unsafe void HighbdConvolve8AvgVert(
837              ushort* src,
838              int srcStride,
839              ushort* dst,
840              int dstStride,
841              Array8<short>[] filter,
842              int x0Q4,
843              int xStepQ4,
844              int y0Q4,
845              int yStepQ4,
846              int w,
847              int h,
848              int bd)
849          {
850              HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
851          }
852  
853          public static unsafe void HighbdConvolve8(
854              ushort* src,
855              int srcStride,
856              ushort* dst,
857              int dstStride,
858              Array8<short>[] filter,
859              int x0Q4,
860              int xStepQ4,
861              int y0Q4,
862              int yStepQ4,
863              int w,
864              int h,
865              int bd)
866          {
867              HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
868          }
869  
870          public static unsafe void HighbdConvolve8Avg(
871              ushort* src,
872              int srcStride,
873              ushort* dst,
874              int dstStride,
875              Array8<short>[] filter,
876              int x0Q4,
877              int xStepQ4,
878              int y0Q4,
879              int yStepQ4,
880              int w,
881              int h,
882              int bd)
883          {
884              // Fixed size intermediate buffer places limits on parameters.
885              ushort* temp = stackalloc ushort[64 * 64];
886              Debug.Assert(w <= 64);
887              Debug.Assert(h <= 64);
888  
889              HighbdConvolve8(src, srcStride, temp, 64, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
890              HighbdConvolveAvg(temp, 64, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
891          }
892  
893          public static unsafe void HighbdConvolveCopy(
894              ushort* src,
895              int srcStride,
896              ushort* dst,
897              int dstStride,
898              Array8<short>[] filter,
899              int x0Q4,
900              int xStepQ4,
901              int y0Q4,
902              int yStepQ4,
903              int w,
904              int h,
905              int bd)
906          {
907              int r;
908  
909              for (r = h; r > 0; --r)
910              {
911                  MemoryUtil.Copy(dst, src, w);
912                  src += srcStride;
913                  dst += dstStride;
914              }
915          }
916  
917          public static unsafe void HighbdConvolveAvg(
918              ushort* src,
919              int srcStride,
920              ushort* dst,
921              int dstStride,
922              Array8<short>[] filter,
923              int x0Q4,
924              int xStepQ4,
925              int y0Q4,
926              int yStepQ4,
927              int w,
928              int h,
929              int bd)
930          {
931              int x, y;
932  
933              for (y = 0; y < h; ++y)
934              {
935                  for (x = 0; x < w; ++x)
936                  {
937                      dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
938                  }
939  
940                  src += srcStride;
941                  dst += dstStride;
942              }
943          }
944      }
945  }