Convolve.cs
using Ryujinx.Common.Memory;
using Ryujinx.Graphics.Nvdec.Vp9.Common;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;

namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
{
    /// <summary>
    /// Sub-pixel interpolation (convolution) routines for 8-bit and high bit depth pixel buffers.
    /// </summary>
    /// <remarks>
    /// Positions named <c>*Q4</c> are fixed-point: shifting right by <c>SubpelBits</c> yields the
    /// integer pixel index and masking with <c>SubpelMask</c> yields the index of the filter kernel.
    /// Every kernel has <c>SubpelTaps</c> coefficients. All public entry points share one parameter
    /// list; parameters a given routine does not need are simply ignored
    /// (presumably so the routines can be used interchangeably by callers - confirm against call sites).
    /// </remarks>
    internal static class Convolve
    {
        // Compile-time switch for the hardware accelerated paths.
        private const bool UseIntrinsics = true;

        /// <summary>
        /// Computes four dot products between <paramref name="vfilter"/> and each of the four source
        /// vectors, returning them as &lt; sum3, sum2, sum1, sum0 &gt;.
        /// </summary>
        /// <param name="vsrc0">Eight 16-bit samples for output 0.</param>
        /// <param name="vsrc1">Eight 16-bit samples for output 1.</param>
        /// <param name="vsrc2">Eight 16-bit samples for output 2.</param>
        /// <param name="vsrc3">Eight 16-bit samples for output 3.</param>
        /// <param name="vfilter">Eight 16-bit filter coefficients.</param>
        /// <param name="zero">An all-zero vector, used as the second operand of the horizontal adds.</param>
        /// <returns>The four 32-bit sums, one per lane.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector128<int> MultiplyAddAdjacent(
            Vector128<short> vsrc0,
            Vector128<short> vsrc1,
            Vector128<short> vsrc2,
            Vector128<short> vsrc3,
            Vector128<short> vfilter,
            Vector128<int> zero)
        {
            // < sumN, sumN, sumN, sumN >
            Vector128<int> sum0 = Sse2.MultiplyAddAdjacent(vsrc0, vfilter);
            Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(vsrc1, vfilter);
            Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(vsrc2, vfilter);
            Vector128<int> sum3 = Sse2.MultiplyAddAdjacent(vsrc3, vfilter);

            // < 0, 0, sumN, sumN >
            sum0 = Ssse3.HorizontalAdd(sum0, zero);
            sum1 = Ssse3.HorizontalAdd(sum1, zero);
            sum2 = Ssse3.HorizontalAdd(sum2, zero);
            sum3 = Ssse3.HorizontalAdd(sum3, zero);

            // < 0, 0, 0, sumN >
            sum0 = Ssse3.HorizontalAdd(sum0, zero);
            sum1 = Ssse3.HorizontalAdd(sum1, zero);
            sum2 = Ssse3.HorizontalAdd(sum2, zero);
            sum3 = Ssse3.HorizontalAdd(sum3, zero);

            // < 0, 0, sum1, sum0 >
            Vector128<int> sum01 = Sse2.UnpackLow(sum0, sum1);

            // < 0, 0, sum3, sum2 >
            Vector128<int> sum23 = Sse2.UnpackLow(sum2, sum3);

            // < sum3, sum2, sum1, sum0 >
            return Sse.MoveLowToHigh(sum01.AsSingle(), sum23.AsSingle()).AsInt32();
        }

        /// <summary>
        /// Adds <paramref name="const64"/> to every lane and arithmetically shifts right by <c>FilterBits</c>.
        /// </summary>
        /// <param name="value">The raw filter sums.</param>
        /// <param name="const64">The rounding term; every caller in this file passes a vector of 64.</param>
        /// <returns>The rounded, scaled sums.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
        {
            return Sse2.ShiftRightArithmetic(Sse2.Add(value, const64), FilterBits);
        }

        /// <summary>
        /// Narrows four 32-bit values to bytes with unsigned saturation (32 to 16 bits, then 16 to 8 bits).
        /// </summary>
        /// <param name="value">The four values to narrow.</param>
        /// <param name="zero">An all-zero vector used to fill the unused half of each pack.</param>
        /// <returns>A vector whose four lowest bytes hold the narrowed values.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
        {
            return Sse2.PackUnsignedSaturate(Sse41.PackUnsignedSaturate(value, zero).AsInt16(), zero.AsInt16());
        }

        /// <summary>
        /// SSE4.1 horizontal filter for the unscaled case (a single kernel is used for the whole block).
        /// </summary>
        /// <remarks>
        /// Produces four output pixels per iteration and stores them with one 32-bit write, so
        /// <paramref name="w"/> must be a multiple of 4; the caller is responsible for checking this.
        /// </remarks>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="xFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="w">Block width in pixels (multiple of 4).</param>
        /// <param name="h">Block height in pixels.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void ConvolveHorizSse41(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int w,
            int h)
        {
            Vector128<int> zero = Vector128<int>.Zero;
            Vector128<int> const64 = Vector128.Create(64);

            ulong x, y;

            // Step back so that the kernel is centred on the output pixel.
            src -= SubpelTaps / 2 - 1;

            fixed (Array8<short>* xFilter = xFilters)
            {
                // The kernel and the integer start offset are the same for every pixel of the block.
                Vector128<short> vfilter = Sse2.LoadVector128((short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8);
                ulong srcOffset = (uint)x0Q4 >> SubpelBits;

                for (y = 0; y < (uint)h; ++y)
                {
                    for (x = 0; x < (uint)w; x += 4)
                    {
                        // Each load widens eight consecutive bytes to 16-bit lanes.
                        Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(&src[srcOffset + x]);
                        Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 1]);
                        Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 2]);
                        Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 3]);

                        Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);

                        // Writes the four result bytes with a single 32-bit scalar store.
                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
                    }

                    src += srcStride;
                    dst += dstStride;
                }
            }
        }

        /// <summary>
        /// Filters a block horizontally, writing the clipped result to <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="xFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step between output pixels.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void ConvolveHoriz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int xStepQ4,
            int w,
            int h)
        {
            // The vector path needs a step of exactly one pixel, and handles four pixels at a time,
            // so it is only safe when the width is a multiple of 4 (otherwise it would access
            // memory past the end of each row).
            if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits && (w & 3) == 0)
            {
                ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);

                return;
            }

            // Step back so that the kernel is centred on the output pixel.
            src -= SubpelTaps / 2 - 1;

            for (int y = 0; y < h; ++y)
            {
                int xQ4 = x0Q4;

                for (int x = 0; x < w; ++x)
                {
                    byte* srcX = &src[xQ4 >> SubpelBits];
                    ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcX[k] * xFilter[k];
                    }

                    dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
                    xQ4 += xStepQ4;
                }

                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// Filters a block horizontally and averages the result with the pixels already in <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="xFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step between output pixels.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        private static unsafe void ConvolveAvgHoriz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int xStepQ4,
            int w,
            int h)
        {
            // Step back so that the kernel is centred on the output pixel.
            src -= SubpelTaps / 2 - 1;

            for (int y = 0; y < h; ++y)
            {
                int xQ4 = x0Q4;

                for (int x = 0; x < w; ++x)
                {
                    byte* srcX = &src[xQ4 >> SubpelBits];
                    ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcX[k] * xFilter[k];
                    }

                    // Rounded average of the existing pixel and the filtered pixel.
                    dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
                    xQ4 += xStepQ4;
                }

                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// AVX2 vertical filter for the unscaled case (a single kernel is used for the whole block).
        /// </summary>
        /// <remarks>
        /// Gathers a 4-pixel wide, 8-row tall tile per iteration and stores four output pixels with
        /// one 32-bit write, so <paramref name="w"/> must be a multiple of 4; the caller is
        /// responsible for checking this.
        /// </remarks>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="yFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="w">Block width in pixels (multiple of 4).</param>
        /// <param name="h">Block height in pixels.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void ConvolveVertAvx2(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int w,
            int h)
        {
            Vector128<int> zero = Vector128<int>.Zero;
            Vector128<int> const64 = Vector128.Create(64);

            // Byte offsets of the eight consecutive rows that feed one output row.
            Vector256<int> indices = Vector256.Create(
                0,
                srcStride,
                srcStride * 2,
                srcStride * 3,
                srcStride * 4,
                srcStride * 5,
                srcStride * 6,
                srcStride * 7);

            ulong x, y;

            // Step back so that the kernel is centred on the output row.
            src -= srcStride * (SubpelTaps / 2 - 1);

            fixed (Array8<short>* yFilter = yFilters)
            {
                Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 8);

                ulong srcBaseY = (uint)y0Q4 >> SubpelBits;

                for (y = 0; y < (uint)h; ++y)
                {
                    ulong srcOffset = (srcBaseY + y) * (uint)srcStride;

                    for (x = 0; x < (uint)w; x += 4)
                    {
                        // One 32-bit element (four pixels) from each of the eight rows.
                        Vector256<int> vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32();

                        Vector128<int> vsrcL = vsrc.GetLower();
                        Vector128<int> vsrcH = vsrc.GetUpper();

                        // Three rounds of byte interleaving transpose the tile so that the eight
                        // samples of each column end up contiguous.
                        Vector128<byte> vsrcUnpck11 = Sse2.UnpackLow(vsrcL.AsByte(), vsrcH.AsByte());
                        Vector128<byte> vsrcUnpck12 = Sse2.UnpackHigh(vsrcL.AsByte(), vsrcH.AsByte());

                        Vector128<byte> vsrcUnpck21 = Sse2.UnpackLow(vsrcUnpck11, vsrcUnpck12);
                        Vector128<byte> vsrcUnpck22 = Sse2.UnpackHigh(vsrcUnpck11, vsrcUnpck12);

                        Vector128<byte> vsrc01 = Sse2.UnpackLow(vsrcUnpck21, vsrcUnpck22);
                        Vector128<byte> vsrc23 = Sse2.UnpackHigh(vsrcUnpck21, vsrcUnpck22);

                        // Bring the upper column of each pair down to the low half for widening.
                        Vector128<byte> vsrc11 = Sse.MoveHighToLow(vsrc01.AsSingle(), vsrc01.AsSingle()).AsByte();
                        Vector128<byte> vsrc33 = Sse.MoveHighToLow(vsrc23.AsSingle(), vsrc23.AsSingle()).AsByte();

                        Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(vsrc01);
                        Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(vsrc11);
                        Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(vsrc23);
                        Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(vsrc33);

                        Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);

                        // Writes the four result bytes with a single 32-bit scalar store.
                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
                    }

                    dst += dstStride;
                }
            }
        }

        /// <summary>
        /// Filters a block vertically, writing the clipped result to <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="yFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step between output rows.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static unsafe void ConvolveVert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            // The vector path needs a step of exactly one pixel, and handles four pixels at a time,
            // so it is only safe when the width is a multiple of 4 (otherwise it would access
            // memory past the end of each row).
            if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits && (w & 3) == 0)
            {
                ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);

                return;
            }

            // Step back so that the kernel is centred on the output row.
            src -= srcStride * (SubpelTaps / 2 - 1);

            // Processed column by column.
            for (int x = 0; x < w; ++x)
            {
                int yQ4 = y0Q4;

                for (int y = 0; y < h; ++y)
                {
                    byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                    ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcY[k * srcStride] * yFilter[k];
                    }

                    dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
                    yQ4 += yStepQ4;
                }

                ++src;
                ++dst;
            }
        }

        /// <summary>
        /// Filters a block vertically and averages the result with the pixels already in <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="yFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step between output rows.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        private static unsafe void ConvolveAvgVert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            // Step back so that the kernel is centred on the output row.
            src -= srcStride * (SubpelTaps / 2 - 1);

            // Processed column by column.
            for (int x = 0; x < w; ++x)
            {
                int yQ4 = y0Q4;

                for (int y = 0; y < h; ++y)
                {
                    byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                    ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcY[k * srcStride] * yFilter[k];
                    }

                    // Rounded average of the existing pixel and the filtered pixel.
                    dst[y * dstStride] = (byte)BitUtils.RoundPowerOfTwo(
                        dst[y * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
                    yQ4 += yStepQ4;
                }

                ++src;
                ++dst;
            }
        }

        /// <summary>
        /// Horizontal-only filter. The vertical position and step arguments are ignored.
        /// </summary>
        public static unsafe void Convolve8Horiz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
        }

        /// <summary>
        /// Horizontal-only filter whose result is averaged with the existing destination pixels.
        /// The vertical position and step arguments are ignored.
        /// </summary>
        public static unsafe void Convolve8AvgHoriz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
        }

        /// <summary>
        /// Vertical-only filter. The horizontal position and step arguments are ignored.
        /// </summary>
        public static unsafe void Convolve8Vert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Vertical-only filter whose result is averaged with the existing destination pixels.
        /// The horizontal position and step arguments are ignored.
        /// </summary>
        public static unsafe void Convolve8AvgVert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Two-dimensional filter: a horizontal pass into a stack scratch buffer followed by a
        /// vertical pass from that buffer into <paramref name="dst"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="filter">Kernel table, used for both passes.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step.</param>
        /// <param name="w">Block width in pixels (at most 64).</param>
        /// <param name="h">Block height in pixels (at most 64).</param>
        [SkipLocalsInit]
        public static unsafe void Convolve8(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            // Note: Fixed size intermediate buffer, temp, places limits on parameters.
            // 2d filtering proceeds in 2 steps:
            //   (1) Interpolate horizontally into an intermediate buffer, temp.
            //   (2) Interpolate temp vertically to derive the sub-pixel result.
            // Deriving the maximum number of rows in the temp buffer (135):
            // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
            // --Largest block size is 64x64 pixels.
            // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
            //   original frame (in 1/16th pixel units).
            // --Must round-up because block may be located at sub-pixel position.
            // --Require an additional SubpelTaps rows for the 8-tap filter tails.
            // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
            // When calling in frame scaling function, the smallest scaling factor is x1/4
            // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
            // big enough.
            byte* temp = stackalloc byte[64 * 135];
            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

            Debug.Assert(w <= 64);
            Debug.Assert(h <= 64);
            Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
            Debug.Assert(xStepQ4 <= 64);

            // The horizontal pass starts above the block so the vertical pass has its leading rows.
            ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight);
            ConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Two-dimensional filter whose result is averaged with the existing destination pixels.
        /// </summary>
        /// <remarks>
        /// The scratch buffer is left uninitialised; <see cref="Convolve8"/> writes every element
        /// that <see cref="ConvolveAvg"/> subsequently reads.
        /// </remarks>
        [SkipLocalsInit]
        public static unsafe void Convolve8Avg(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            // Fixed size intermediate buffer places limits on parameters.
            byte* temp = stackalloc byte[64 * 64];

            Debug.Assert(w <= 64);
            Debug.Assert(h <= 64);

            Convolve8(src, srcStride, temp, 64, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
            ConvolveAvg(temp, 64, dst, dstStride, null, 0, 0, 0, 0, w, h);
        }

        /// <summary>
        /// Copies the block row by row without filtering. The filter, position and step arguments are ignored.
        /// </summary>
        public static unsafe void ConvolveCopy(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            for (int r = h; r > 0; --r)
            {
                MemoryUtil.Copy(dst, src, w);
                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// Replaces each destination pixel with the rounded average of itself and the matching
        /// source pixel. The filter, position and step arguments are ignored.
        /// </summary>
        public static unsafe void ConvolveAvg(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
                }

                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8Horiz"/>.
        /// </summary>
        public static unsafe void ScaledHoriz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8Vert"/>.
        /// </summary>
        public static unsafe void ScaledVert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8"/>.
        /// </summary>
        public static unsafe void Scaled2D(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8AvgHoriz"/>.
        /// </summary>
        public static unsafe void ScaledAvgHoriz(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8AvgVert"/>.
        /// </summary>
        public static unsafe void ScaledAvgVert(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// Forwards to <see cref="Convolve8Avg"/>.
        /// </summary>
        public static unsafe void ScaledAvg2D(
            byte* src,
            int srcStride,
            byte* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h)
        {
            Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
        }

        /// <summary>
        /// High bit depth counterpart of <see cref="ConvolveHoriz"/>; results are clipped for bit depth <paramref name="bd"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="xFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step between output pixels.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        /// <param name="bd">Bit depth passed to the clipping helper.</param>
        private static unsafe void HighbdConvolveHoriz(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int xStepQ4,
            int w,
            int h,
            int bd)
        {
            // Step back so that the kernel is centred on the output pixel.
            src -= SubpelTaps / 2 - 1;

            for (int y = 0; y < h; ++y)
            {
                int xQ4 = x0Q4;

                for (int x = 0; x < w; ++x)
                {
                    ushort* srcX = &src[xQ4 >> SubpelBits];
                    ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcX[k] * xFilter[k];
                    }

                    dst[x] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
                    xQ4 += xStepQ4;
                }

                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// High bit depth counterpart of <see cref="ConvolveAvgHoriz"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="xFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step between output pixels.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        /// <param name="bd">Bit depth passed to the clipping helper.</param>
        private static unsafe void HighbdConvolveAvgHoriz(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] xFilters,
            int x0Q4,
            int xStepQ4,
            int w,
            int h,
            int bd)
        {
            // Step back so that the kernel is centred on the output pixel.
            src -= SubpelTaps / 2 - 1;

            for (int y = 0; y < h; ++y)
            {
                int xQ4 = x0Q4;

                for (int x = 0; x < w; ++x)
                {
                    ushort* srcX = &src[xQ4 >> SubpelBits];
                    ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcX[k] * xFilter[k];
                    }

                    // Rounded average of the existing pixel and the filtered pixel.
                    dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
                    xQ4 += xStepQ4;
                }

                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// High bit depth counterpart of <see cref="ConvolveVert"/>; results are clipped for bit depth <paramref name="bd"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="yFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step between output rows.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        /// <param name="bd">Bit depth passed to the clipping helper.</param>
        private static unsafe void HighbdConvolveVert(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            // Step back so that the kernel is centred on the output row.
            src -= srcStride * (SubpelTaps / 2 - 1);

            // Processed column by column.
            for (int x = 0; x < w; ++x)
            {
                int yQ4 = y0Q4;

                for (int y = 0; y < h; ++y)
                {
                    ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                    ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcY[k * srcStride] * yFilter[k];
                    }

                    dst[y * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
                    yQ4 += yStepQ4;
                }

                ++src;
                ++dst;
            }
        }

        /// <summary>
        /// High bit depth counterpart of <see cref="ConvolveAvgVert"/>.
        /// </summary>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="yFilters">Kernel table indexed by the fractional part of the position.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step between output rows.</param>
        /// <param name="w">Block width in pixels.</param>
        /// <param name="h">Block height in pixels.</param>
        /// <param name="bd">Bit depth passed to the clipping helper.</param>
        private static unsafe void HighbdConvolveAvgVert(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] yFilters,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            // Step back so that the kernel is centred on the output row.
            src -= srcStride * (SubpelTaps / 2 - 1);

            // Processed column by column.
            for (int x = 0; x < w; ++x)
            {
                int yQ4 = y0Q4;

                for (int y = 0; y < h; ++y)
                {
                    ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                    ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
                    int sum = 0;

                    for (int k = 0; k < SubpelTaps; ++k)
                    {
                        sum += srcY[k * srcStride] * yFilter[k];
                    }

                    // Rounded average of the existing pixel and the filtered pixel.
                    dst[y * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(
                        dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
                    yQ4 += yStepQ4;
                }

                ++src;
                ++dst;
            }
        }

        /// <summary>
        /// High bit depth two-dimensional filter: a horizontal pass into a stack scratch buffer
        /// followed by a vertical pass from that buffer into <paramref name="dst"/>.
        /// </summary>
        /// <remarks>
        /// The scratch buffer is left uninitialised; the horizontal pass writes every element that
        /// the vertical pass subsequently reads.
        /// </remarks>
        /// <param name="src">Pointer to the first source pixel of the block.</param>
        /// <param name="srcStride">Distance between source rows, in pixels.</param>
        /// <param name="dst">Pointer to the first destination pixel.</param>
        /// <param name="dstStride">Distance between destination rows, in pixels.</param>
        /// <param name="filter">Kernel table, used for both passes.</param>
        /// <param name="x0Q4">Fixed-point horizontal start position.</param>
        /// <param name="xStepQ4">Fixed-point horizontal step.</param>
        /// <param name="y0Q4">Fixed-point vertical start position.</param>
        /// <param name="yStepQ4">Fixed-point vertical step.</param>
        /// <param name="w">Block width in pixels (at most 64).</param>
        /// <param name="h">Block height in pixels (at most 64).</param>
        /// <param name="bd">Bit depth passed to the clipping helper.</param>
        [SkipLocalsInit]
        private static unsafe void HighbdConvolve(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            // Note: Fixed size intermediate buffer, temp, places limits on parameters.
            // 2d filtering proceeds in 2 steps:
            //   (1) Interpolate horizontally into an intermediate buffer, temp.
            //   (2) Interpolate temp vertically to derive the sub-pixel result.
            // Deriving the maximum number of rows in the temp buffer (135):
            // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
            // --Largest block size is 64x64 pixels.
            // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
            //   original frame (in 1/16th pixel units).
            // --Must round-up because block may be located at sub-pixel position.
            // --Require an additional SubpelTaps rows for the 8-tap filter tails.
            // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
            ushort* temp = stackalloc ushort[64 * 135];
            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

            Debug.Assert(w <= 64);
            Debug.Assert(h <= 64);
            Debug.Assert(yStepQ4 <= 32);
            Debug.Assert(xStepQ4 <= 32);

            // The horizontal pass starts above the block so the vertical pass has its leading rows.
            HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
            HighbdConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth horizontal-only filter. The vertical position and step arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolve8Horiz(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth horizontal-only filter averaged with the existing destination pixels.
        /// The vertical position and step arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolve8AvgHoriz(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth vertical-only filter. The horizontal position and step arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolve8Vert(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth vertical-only filter averaged with the existing destination pixels.
        /// The horizontal position and step arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolve8AvgVert(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            HighbdConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth two-dimensional filter; forwards to <see cref="HighbdConvolve"/>.
        /// </summary>
        public static unsafe void HighbdConvolve8(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
        }

        /// <summary>
        /// High bit depth two-dimensional filter averaged with the existing destination pixels.
        /// </summary>
        /// <remarks>
        /// The scratch buffer is left uninitialised; <see cref="HighbdConvolve8"/> writes every
        /// element that <see cref="HighbdConvolveAvg"/> subsequently reads.
        /// </remarks>
        [SkipLocalsInit]
        public static unsafe void HighbdConvolve8Avg(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            // Fixed size intermediate buffer places limits on parameters.
            ushort* temp = stackalloc ushort[64 * 64];

            Debug.Assert(w <= 64);
            Debug.Assert(h <= 64);

            HighbdConvolve8(src, srcStride, temp, 64, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
            HighbdConvolveAvg(temp, 64, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
        }

        /// <summary>
        /// High bit depth row by row copy without filtering. The filter, position, step and bit
        /// depth arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolveCopy(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            for (int r = h; r > 0; --r)
            {
                MemoryUtil.Copy(dst, src, w);
                src += srcStride;
                dst += dstStride;
            }
        }

        /// <summary>
        /// Replaces each destination pixel with the rounded average of itself and the matching
        /// source pixel. The filter, position, step and bit depth arguments are ignored.
        /// </summary>
        public static unsafe void HighbdConvolveAvg(
            ushort* src,
            int srcStride,
            ushort* dst,
            int dstStride,
            Array8<short>[] filter,
            int x0Q4,
            int xStepQ4,
            int y0Q4,
            int yStepQ4,
            int w,
            int h,
            int bd)
        {
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < w; ++x)
                {
                    dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
                }

                src += srcStride;
                dst += dstStride;
            }
        }
    }
}