// InstEmitMemory.cs
using Ryujinx.Graphics.Shader.Decoders;
using Ryujinx.Graphics.Shader.IntermediateRepresentation;
using Ryujinx.Graphics.Shader.Translation;
using System.Numerics;
using static Ryujinx.Graphics.Shader.Instructions.InstEmitHelper;
using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper;

namespace Ryujinx.Graphics.Shader.Instructions
{
    // Emitters for the shader memory instructions: global, local and shared
    // loads/stores, constant buffer loads, and atomic/reduction operations.
    static partial class InstEmit
    {
        // ATOM: atomic operation on global memory, writing the old value to Dest.
        public static void Atom(EmitterContext context)
        {
            InstAtom op = context.GetOp<InstAtom>();

            // Sign-extend the 20-bit immediate byte offset.
            int sOffset = (op.Imm20 << 12) >> 12;

            (Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, new Register(op.SrcA, RegisterType.Gpr), op.E, sOffset);

            Operand value = GetSrcReg(context, op.SrcB);

            Operand res = EmitAtomicOp(context, StorageKind.GlobalMemory, op.Op, op.Size, addrLow, addrHigh, value);

            context.Copy(GetDest(op.Dest), res);
        }

        // ATOMS: atomic operation on shared memory. Only valid on compute,
        // since shared memory does not exist on other stages.
        public static void Atoms(EmitterContext context)
        {
            if (context.TranslatorContext.Definitions.Stage != ShaderStage.Compute)
            {
                context.TranslatorContext.GpuAccessor.Log($"Atoms instruction is not valid on \"{context.TranslatorContext.Definitions.Stage}\" stage.");
                return;
            }

            InstAtoms op = context.GetOp<InstAtoms>();

            // The source register holds a byte offset; shared memory is addressed
            // in 32-bit words, so divide by 4.
            Operand offset = context.ShiftRightU32(GetSrcReg(context, op.SrcA), Const(2));

            // Sign-extend the 22-bit immediate (word) offset.
            int sOffset = (op.Imm22 << 10) >> 10;

            offset = context.IAdd(offset, Const(sOffset));

            Operand value = GetSrcReg(context, op.SrcB);

            // Map the ATOMS size encoding onto the common AtomSize enum; any
            // unrecognized encoding falls back to an unsigned 32-bit operation.
            AtomSize size = op.AtomsSize switch
            {
                AtomsSize.S32 => AtomSize.S32,
                AtomsSize.U64 => AtomSize.U64,
                AtomsSize.S64 => AtomSize.S64,
                _ => AtomSize.U32,
            };

            Operand id = Const(context.ResourceManager.SharedMemoryId);
            Operand res = EmitAtomicOp(context, StorageKind.SharedMemory, op.AtomOp, size, id, offset, value);

            context.Copy(GetDest(op.Dest), res);
        }

        // LDC: load from a constant buffer, optionally with a dynamically
        // indexed buffer slot taken from the source register.
        public static void Ldc(EmitterContext context)
        {
            InstLdc op = context.GetOp<InstLdc>();

            if (op.LsSize > LsSize2.B64)
            {
                context.TranslatorContext.GpuAccessor.Log($"Invalid LDC size: {op.LsSize}.");
                return;
            }

            // Sub-word (8/16-bit) loads need the loaded word narrowed afterwards.
            bool isSmallInt = op.LsSize < LsSize2.B32;

            // 64-bit loads write a pair of consecutive registers.
            int count = op.LsSize == LsSize2.B64 ? 2 : 1;

            Operand slot = Const(op.CbufSlot);
            Operand srcA = GetSrcReg(context, op.SrcA);

            if (op.AddressMode == AddressMode.Is || op.AddressMode == AddressMode.Isl)
            {
                // Indexed modes: the high 16 bits of the source register select the
                // constant buffer slot, the low 16 bits are the byte offset.
                slot = context.IAdd(slot, context.BitfieldExtractU32(srcA, Const(16), Const(16)));
                srcA = context.BitwiseAnd(srcA, Const(0xffff));
            }

            Operand addr = context.IAdd(srcA, Const(Imm16ToSInt(op.CbufOffset)));
            Operand wordOffset = context.ShiftRightU32(addr, Const(2));

            for (int index = 0; index < count; index++)
            {
                Register dest = new(op.Dest + index, RegisterType.Gpr);

                // Writes to RZ (and any register past it) are discarded.
                if (dest.IsRZ)
                {
                    break;
                }

                Operand offset = context.IAdd(wordOffset, Const(index));
                Operand value = EmitLoadConstant(context, slot, offset);

                if (isSmallInt)
                {
                    value = ExtractSmallInt(context, (LsSize)op.LsSize, GetBitOffset(context, addr), value);
                }

                context.Copy(Register(dest), value);
            }
        }

        // LDG: load from global memory.
        public static void Ldg(EmitterContext context)
        {
            InstLdg op = context.GetOp<InstLdg>();

            EmitLdg(context, op.LsSize, op.SrcA, op.Dest, Imm24ToSInt(op.Imm24), op.E);
        }

        // LDL: load from (per-thread) local memory.
        public static void Ldl(EmitterContext context)
        {
            InstLdl op = context.GetOp<InstLdl>();

            EmitLoad(context, StorageKind.LocalMemory, op.LsSize, GetSrcReg(context, op.SrcA), op.Dest, Imm24ToSInt(op.Imm24));
        }

        // LDS: load from shared memory. Only valid on compute.
        public static void Lds(EmitterContext context)
        {
            if (context.TranslatorContext.Definitions.Stage != ShaderStage.Compute)
            {
                context.TranslatorContext.GpuAccessor.Log($"Lds instruction is not valid on \"{context.TranslatorContext.Definitions.Stage}\" stage.");
                return;
            }

            InstLds op = context.GetOp<InstLds>();

            EmitLoad(context, StorageKind.SharedMemory, op.LsSize, GetSrcReg(context, op.SrcA), op.Dest, Imm24ToSInt(op.Imm24));
        }

        // RED: reduction (atomic without a result register) on global memory.
        public static void Red(EmitterContext context)
        {
            InstRed op = context.GetOp<InstRed>();

            // NOTE(review): unlike Atom, the 20-bit immediate is passed through
            // without explicit sign extension here — confirm whether Imm20 is
            // already decoded as signed for RED.
            (Operand addrLow, Operand addrHigh) = Get40BitsAddress(context, new Register(op.SrcA, RegisterType.Gpr), op.E, op.Imm20);

            // NOTE(review): GetDest is used here purely to map the SrcB register
            // index to an operand (the value to reduce); the atomic's result is
            // discarded.
            EmitAtomicOp(context, StorageKind.GlobalMemory, (AtomOp)op.RedOp, op.RedSize, addrLow, addrHigh, GetDest(op.SrcB));
        }

        // STG: store to global memory.
        public static void Stg(EmitterContext context)
        {
            InstStg op = context.GetOp<InstStg>();

            EmitStg(context, op.LsSize, op.SrcA, op.Dest, Imm24ToSInt(op.Imm24), op.E);
        }

        // STL: store to (per-thread) local memory.
        public static void Stl(EmitterContext context)
        {
            InstStl op = context.GetOp<InstStl>();

            EmitStore(context, StorageKind.LocalMemory, op.LsSize, GetSrcReg(context, op.SrcA), op.Dest, Imm24ToSInt(op.Imm24));
        }

        // STS: store to shared memory. Only valid on compute.
        public static void Sts(EmitterContext context)
        {
            if (context.TranslatorContext.Definitions.Stage != ShaderStage.Compute)
            {
                context.TranslatorContext.GpuAccessor.Log($"Sts instruction is not valid on \"{context.TranslatorContext.Definitions.Stage}\" stage.");
                return;
            }

            InstSts op = context.GetOp<InstSts>();

            EmitStore(context, StorageKind.SharedMemory, op.LsSize, GetSrcReg(context, op.SrcA), op.Dest, Imm24ToSInt(op.Imm24));
        }

        // Loads a single 32-bit word from a constant buffer. When the slot is not
        // a compile-time constant, every constant buffer reported as in use is
        // compared against the slot and the matching value is selected.
        private static Operand EmitLoadConstant(EmitterContext context, Operand slot, Operand offset)
        {
            // Constant buffers are accessed as vec4 arrays: split the word offset
            // into a vector index and an element (0..3) index.
            Operand vecIndex = context.ShiftRightU32(offset, Const(2));
            Operand elemIndex = context.BitwiseAnd(offset, Const(3));

            if (slot.Type == OperandType.Constant)
            {
                // Slot known at translation time: load directly from its binding.
                int binding = context.ResourceManager.GetConstantBufferBinding(slot.Value);
                return context.Load(StorageKind.ConstantBuffer, binding, Const(0), vecIndex, elemIndex);
            }
            else
            {
                // Dynamic slot: build a select chain over all buffers in the use
                // mask. A slot that matches no buffer yields 0.
                Operand value = Const(0);

                uint cbUseMask = context.TranslatorContext.GpuAccessor.QueryConstantBufferUse();

                while (cbUseMask != 0)
                {
                    int cbIndex = BitOperations.TrailingZeroCount(cbUseMask);
                    int binding = context.ResourceManager.GetConstantBufferBinding(cbIndex);

                    Operand isCurrent = context.ICompareEqual(slot, Const(cbIndex));
                    Operand currentValue = context.Load(StorageKind.ConstantBuffer, binding, Const(0), vecIndex, elemIndex);

                    value = context.ConditionalSelect(isCurrent, currentValue, value);

                    // Clear the bit we just handled.
                    cbUseMask &= ~(1u << cbIndex);
                }

                return value;
            }
        }

        // Emits the IR operation for a single atomic op. For global memory,
        // (e0, e1) is the (low, high) address pair; for shared memory it is the
        // (memory id, word offset) pair. Only Add supports 64-bit in principle;
        // here every op other than a supported 32-bit form is logged and the
        // default result (0) is returned.
        private static Operand EmitAtomicOp(
            EmitterContext context,
            StorageKind storageKind,
            AtomOp op,
            AtomSize type,
            Operand e0,
            Operand e1,
            Operand value)
        {
            Operand res = Const(0);

            switch (op)
            {
                case AtomOp.Add:
                    if (type == AtomSize.S32 || type == AtomSize.U32)
                    {
                        res = context.AtomicAdd(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.Min:
                    // Min/Max are sign-sensitive, so signed and unsigned forms differ.
                    if (type == AtomSize.S32)
                    {
                        res = context.AtomicMinS32(storageKind, e0, e1, value);
                    }
                    else if (type == AtomSize.U32)
                    {
                        res = context.AtomicMinU32(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.Max:
                    if (type == AtomSize.S32)
                    {
                        res = context.AtomicMaxS32(storageKind, e0, e1, value);
                    }
                    else if (type == AtomSize.U32)
                    {
                        res = context.AtomicMaxU32(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.And:
                    if (type == AtomSize.S32 || type == AtomSize.U32)
                    {
                        res = context.AtomicAnd(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.Or:
                    if (type == AtomSize.S32 || type == AtomSize.U32)
                    {
                        res = context.AtomicOr(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.Xor:
                    if (type == AtomSize.S32 || type == AtomSize.U32)
                    {
                        res = context.AtomicXor(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                case AtomOp.Exch:
                    if (type == AtomSize.S32 || type == AtomSize.U32)
                    {
                        res = context.AtomicSwap(storageKind, e0, e1, value);
                    }
                    else
                    {
                        context.TranslatorContext.GpuAccessor.Log($"Invalid reduction type: {type}.");
                    }
                    break;
                default:
                    context.TranslatorContext.GpuAccessor.Log($"Invalid atomic operation: {op}.");
                    break;
            }

            return res;
        }

        // Shared implementation of LDL/LDS: loads 1, 2 or 4 consecutive words
        // from local or shared memory into consecutive destination registers,
        // narrowing sub-word sizes after the load.
        private static void EmitLoad(
            EmitterContext context,
            StorageKind storageKind,
            LsSize2 size,
            Operand srcA,
            int rd,
            int offset)
        {
            if (size > LsSize2.B128)
            {
                context.TranslatorContext.GpuAccessor.Log($"Invalid load size: {size}.");
                return;
            }

            int id = storageKind == StorageKind.LocalMemory
                ? context.ResourceManager.LocalMemoryId
                : context.ResourceManager.SharedMemoryId;
            bool isSmallInt = size < LsSize2.B32;

            int count = size switch
            {
                LsSize2.B64 => 2,
                LsSize2.B128 => 4,
                _ => 1,
            };

            Operand baseOffset = context.Copy(srcA);

            for (int index = 0; index < count; index++)
            {
                Register dest = new(rd + index, RegisterType.Gpr);

                // Stop at RZ: it and everything after it is not writable.
                if (dest.IsRZ)
                {
                    break;
                }

                Operand byteOffset = context.IAdd(baseOffset, Const(offset + index * 4));
                Operand wordOffset = context.ShiftRightU32(byteOffset, Const(2)); // Word offset = byte offset / 4 (one word = 4 bytes).
                Operand bitOffset = GetBitOffset(context, byteOffset);
                Operand value = context.Load(storageKind, id, wordOffset);

                if (isSmallInt)
                {
                    value = ExtractSmallInt(context, (LsSize)size, bitOffset, value);
                }

                context.Copy(Register(dest), value);
            }
        }

        // Implementation of LDG: global memory load of 1, 2 or 4 words into
        // consecutive destination registers. Sub-word sizes are handled by the
        // storage kind itself (GlobalMemoryU8/S8/U16/S16).
        private static void EmitLdg(
            EmitterContext context,
            LsSize size,
            int ra,
            int rd,
            int offset,
            bool extended)
        {
            int count = GetVectorCount(size);
            StorageKind storageKind = GetStorageKind(size);

            // Only the high word of the 40-bit address is taken from here; the
            // low word is recomputed per element below.
            (_, Operand addrHigh) = Get40BitsAddress(context, new Register(ra, RegisterType.Gpr), extended, offset);

            Operand srcA = context.Copy(new Operand(new Register(ra, RegisterType.Gpr)));

            for (int index = 0; index < count; index++)
            {
                Register dest = new(rd + index, RegisterType.Gpr);

                if (dest.IsRZ)
                {
                    break;
                }

                Operand value = context.Load(storageKind, context.IAdd(srcA, Const(offset + index * 4)), addrHigh);

                context.Copy(Register(dest), value);
            }
        }

        // Shared implementation of STL/STS: stores 1, 2 or 4 consecutive words
        // to local or shared memory. Sub-word stores go through read-modify-write
        // for local memory, and through dedicated 8/16-bit store kinds for
        // shared memory.
        private static void EmitStore(
            EmitterContext context,
            StorageKind storageKind,
            LsSize2 size,
            Operand srcA,
            int rd,
            int offset)
        {
            if (size > LsSize2.B128)
            {
                context.TranslatorContext.GpuAccessor.Log($"Invalid store size: {size}.");
                return;
            }

            int id = storageKind == StorageKind.LocalMemory
                ? context.ResourceManager.LocalMemoryId
                : context.ResourceManager.SharedMemoryId;
            bool isSmallInt = size < LsSize2.B32;

            int count = size switch
            {
                LsSize2.B64 => 2,
                LsSize2.B128 => 4,
                _ => 1,
            };

            Operand baseOffset = context.Copy(srcA);

            for (int index = 0; index < count; index++)
            {
                // NOTE(review): when rd + index would go past RZ, the base
                // register is read instead — presumably just to keep the register
                // index in range; confirm intent.
                bool isRz = rd + index >= RegisterConsts.RegisterZeroIndex;

                Operand value = Register(isRz ? rd : rd + index, RegisterType.Gpr);
                Operand byteOffset = context.IAdd(baseOffset, Const(offset + index * 4));
                Operand wordOffset = context.ShiftRightU32(byteOffset, Const(2));
                Operand bitOffset = GetBitOffset(context, byteOffset);

                if (isSmallInt && storageKind == StorageKind.LocalMemory)
                {
                    // Local memory has no sub-word store: load the word, insert
                    // the small value, and write the whole word back.
                    Operand word = context.Load(storageKind, id, wordOffset);

                    value = InsertSmallInt(context, (LsSize)size, bitOffset, word, value);
                }

                if (storageKind == StorageKind.LocalMemory)
                {
                    context.Store(storageKind, id, wordOffset, value);
                }
                else if (storageKind == StorageKind.SharedMemory)
                {
                    switch (size)
                    {
                        case LsSize2.U8:
                        case LsSize2.S8:
                            context.Store(StorageKind.SharedMemory8, id, byteOffset, value);
                            break;
                        case LsSize2.U16:
                        case LsSize2.S16:
                            context.Store(StorageKind.SharedMemory16, id, byteOffset, value);
                            break;
                        default:
                            context.Store(storageKind, id, wordOffset, value);
                            break;
                    }
                }
            }
        }

        // Implementation of STG: global memory store of 1, 2 or 4 words from
        // consecutive source registers.
        private static void EmitStg(
            EmitterContext context,
            LsSize2 size,
            int ra,
            int rd,
            int offset,
            bool extended)
        {
            if (size > LsSize2.B128)
            {
                context.TranslatorContext.GpuAccessor.Log($"Invalid store size: {size}.");
                return;
            }

            int count = GetVectorCount((LsSize)size);
            StorageKind storageKind = GetStorageKind((LsSize)size);

            // As in EmitLdg, only the high address word is reused; the low word
            // is recomputed per element.
            (_, Operand addrHigh) = Get40BitsAddress(context, new Register(ra, RegisterType.Gpr), extended, offset);

            Operand srcA = context.Copy(new Operand(new Register(ra, RegisterType.Gpr)));

            for (int index = 0; index < count; index++)
            {
                // NOTE(review): same RZ clamping as EmitStore — confirm intent.
                bool isRz = rd + index >= RegisterConsts.RegisterZeroIndex;

                Operand value = Register(isRz ? rd : rd + index, RegisterType.Gpr);

                Operand addrLowOffset = context.IAdd(srcA, Const(offset + index * 4));

                context.Store(storageKind, addrLowOffset, addrHigh, value);
            }
        }

        // Maps a load/store size to the global memory storage kind, so that
        // sub-word accesses use the typed 8/16-bit variants.
        private static StorageKind GetStorageKind(LsSize size)
        {
            return size switch
            {
                LsSize.U8 => StorageKind.GlobalMemoryU8,
                LsSize.S8 => StorageKind.GlobalMemoryS8,
                LsSize.U16 => StorageKind.GlobalMemoryU16,
                LsSize.S16 => StorageKind.GlobalMemoryS16,
                _ => StorageKind.GlobalMemory,
            };
        }

        // Number of consecutive 32-bit registers read or written by a
        // load/store of the given size.
        private static int GetVectorCount(LsSize size)
        {
            return size switch
            {
                LsSize.B64 => 2,
                LsSize.B128 or LsSize.UB128 => 4,
                _ => 1,
            };
        }

        // Builds the (low, high) pair of a 40-bit global memory address:
        // low = ra + offset; high = ra+1 when the extended (64-bit address)
        // form is used, with the carry out of the low addition propagated.
        private static (Operand, Operand) Get40BitsAddress(
            EmitterContext context,
            Register ra,
            bool extended,
            int offset)
        {
            Operand addrLow = Register(ra);
            Operand addrHigh;

            if (extended && !ra.IsRZ)
            {
                // Extended addressing: the high 32 bits come from the next register.
                addrHigh = Register(ra.Index + 1, RegisterType.Gpr);
            }
            else
            {
                addrHigh = Const(0);
            }

            Operand offs = Const(offset);

            addrLow = context.IAdd(addrLow, offs);

            if (extended)
            {
                // Carry detection: unsigned (a + b) < b implies the add wrapped.
                Operand carry = context.ICompareLessUnsigned(addrLow, offs);

                addrHigh = context.IAdd(addrHigh, context.ConditionalSelect(carry, Const(1), Const(0)));
            }

            return (addrLow, addrHigh);
        }

        // Bit offset of a byte address within its containing word.
        private static Operand GetBitOffset(EmitterContext context, Operand baseOffset)
        {
            // Note: bit offset = (baseOffset & 0b11) * 8.
            // Addresses should be always aligned to the integer type,
            // so we don't need to take unaligned addresses into account.
            return context.ShiftLeft(context.BitwiseAnd(baseOffset, Const(3)), Const(3));
        }

        // Extracts a zero- or sign-extended 8/16-bit value from a loaded word,
        // starting at the given bit offset. 32-bit sizes pass through unchanged
        // apart from the (zero) shift.
        private static Operand ExtractSmallInt(
            EmitterContext context,
            LsSize size,
            Operand bitOffset,
            Operand value)
        {
            value = context.ShiftRightU32(value, bitOffset);

            switch (size)
            {
                case LsSize.U8:
                    value = ZeroExtendTo32(context, value, 8);
                    break;
                case LsSize.U16:
                    value = ZeroExtendTo32(context, value, 16);
                    break;
                case LsSize.S8:
                    value = SignExtendTo32(context, value, 8);
                    break;
                case LsSize.S16:
                    value = SignExtendTo32(context, value, 16);
                    break;
            }

            return value;
        }

        // Inserts an 8/16-bit value into an existing word at the given bit
        // offset, for read-modify-write sub-word stores. 32-bit sizes return
        // the value unchanged.
        private static Operand InsertSmallInt(
            EmitterContext context,
            LsSize size,
            Operand bitOffset,
            Operand word,
            Operand value)
        {
            switch (size)
            {
                case LsSize.U8:
                case LsSize.S8:
                    value = context.BitwiseAnd(value, Const(0xff));
                    value = context.BitfieldInsert(word, value, bitOffset, Const(8));
                    break;

                case LsSize.U16:
                case LsSize.S16:
                    value = context.BitwiseAnd(value, Const(0xffff));
                    value = context.BitfieldInsert(word, value, bitOffset, Const(16));
                    break;
            }

            return value;
        }
    }
}