# tests/gpu/hello_world_nvidia.nim
  1  # Constantine
  2  # Copyright (c) 2018-2019    Status Research & Development GmbH
  3  # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
  4  # Licensed and distributed under either of
  5  #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  6  #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  7  # at your option. This file may not be copied, modified, or distributed except according to those terms.
  8  
  9  import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]
 10  
 11  # ############################################################
 12  #
 13  #                         NVVM
 14  #
 15  # ############################################################
 16  
 17  # https://docs.nvidia.com/cuda/libnvvm-api/index.html
 18  # https://docs.nvidia.com/pdf/libNVVM_API.pdf
 19  # https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
 20  # https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf
 21  
 22  # ⚠ NVVM IR is based on LLVM 7.0.1 IR which dates from december 2018.
 23  # There are a couple of caveats:
 24  # - LLVM 7.0.1 is usually not available in repo, making installation difficult
 25  # - There was a ABI breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
 26  # - LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall meaning
 27  #   supporting that and latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
 28  # - When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
 29  #   if the LLVM IR is passed as bitcode,
 30  #   the kernel content is silently discarded, this does not happen with built-in add.
 31  #   It is unsure if it's call2 or inline ASM incompatibility that causes the issues
 32  # - When generating a add-with-carry kernel with inline ASM calls from LLVM-14,
#   if the LLVM IR is passed as textual IR, the code is refused with NVVM_ERROR_INVALID_IR
 34  
 35  # Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward
 36  
# Link against Nvidia's proprietary libNVVM.
# NOTE(review): the library path assumes an Arch-style CUDA install under
# /opt/cuda — confirm/parameterize for other distros and CUDA versions.
static: echo "[Constantine] Using library libnvvm.so"
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".}
 39  
type
  # Status codes returned by every libNVVM entry point.
  # Mirrors the C `nvvmResult` enum; {.size: sizeof(cint).} keeps the
  # ABI layout identical to the C enum so values round-trip through FFI.
  NvvmResult* {.size: sizeof(cint).} = enum
    NVVM_SUCCESS = 0
    NVVM_ERROR_OUT_OF_MEMORY = 1
    NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
    NVVM_ERROR_IR_VERSION_MISMATCH = 3
    NVVM_ERROR_INVALID_INPUT = 4
    NVVM_ERROR_INVALID_PROGRAM = 5
    NVVM_ERROR_INVALID_IR = 6
    NVVM_ERROR_INVALID_OPTION = 7
    NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
    NVVM_ERROR_COMPILATION = 9

  # Opaque handle to an NVVM compilation unit (C `nvvmProgram`).
  # `distinct` prevents accidental mixing with other raw pointers.
  NvvmProgram = distinct pointer
 54  
{.push noconv, importc, dynlib: "libnvvm.so".}

# Raw bindings to libNVVM, loaded dynamically at runtime.
# Every entry point returns an NvvmResult status; out-parameters use `var`.

# Human-readable message for a status code.
proc nvvmGetErrorString*(r: NvvmResult): cstring
# libNVVM library version.
proc nvvmVersion*(major, minor: var int32): NvvmResult
# Supported NVVM IR version (compile + debug-metadata versions).
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult

proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
# `wrapOpenArrayLenType` splits the openArray into (ptr, csize_t len) for the C ABI.
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
# Size queries include the trailing '\0' of the buffer they describe.
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult

{.pop.} # {.push noconv, importc, dynlib: "libnvvm.so".}
 73  
 74  # ############################################################
 75  #
 76  #                    PTX Codegen
 77  #
 78  # ############################################################
 79  
 80  template check*(status: NvvmResult) =
 81    let code = status # Assign so execution is done once only.
 82    if code != NVVM_SUCCESS:
 83      stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code
 84      quit 1
 85  
 86  proc getNvvmLog(prog: NvvmProgram): string {.used.} =
 87    var logSize: csize_t
 88    check nvvmGetProgramLogSize(prog, logSize)
 89  
 90    if logSize > 0:
 91      result = newString(logSize)
 92      check nvvmGetProgramLog(prog, result[0].addr)
 93  
proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## PTX codegen via Nvidia NVVM.
  ## Hands `module` to libNVVM as bitcode, compiles it for compute
  ## capability `sm` and returns the PTX text.
  ##
  ## NOTE(review): on any NVVM failure the `check` template quits the
  ## process, so `options` is not freed and the NVVM log (getNvvmLog)
  ## is never printed on the error path — acceptable for a test program.

  # ######################################
  # LLVM -> NNVM handover

  var prog{.noInit.}: NvvmProgram
  check nvvmCreateProgram(prog)

  # The module is passed as bitcode; see the LLVM-7 caveats at the top of the file.
  let bitcode = module.toBitcode()
  check nvvmAddModuleToProgram(prog, bitcode, cstring module.getIdentifier())

  # ######################################
  # GPU codegen

  check nvvmVerifyProgram(prog, 0, nil)

  # e.g. sm = (8, 6) -> "-arch=compute_86"
  let options = allocCStringArray(["-arch=compute_" & $sm.major & $sm.minor])
  check nvvmCompileProgram(prog, 1, options)
  deallocCStringArray(options)
  var ptxSize: csize_t
  check nvvmGetCompiledResultSize(prog, ptxSize)
  result = newString(ptxSize-1) # The NNVM size includes '\0' ending char while Nim excludes it.
  check nvvmGetCompiledResult(prog, result[0].addr)

  check nvvmDestroyProgram(prog)
120  
proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## Compile `module` to PTX assembly text with LLVM's NVPTX backend,
  ## targeting compute capability `sm` (e.g. (8, 6) -> cpu "sm_86").

  # Fail fast on malformed IR before touching the target machine.
  module.verify(AbortProcessAction)

  initializeFullNVPTXTarget()

  const triple = "nvptx64-nvidia-cuda"
  let cpuName = "sm_" & $sm.major & $sm.minor

  let machine = createTargetMachine(
    target = toTarget(triple),
    triple = triple,
    cpu = cstring(cpuName),
    features = "",
    level = CodeGenLevelAggressive,
    reloc = RelocDefault,
    codeModel = CodeModelDefault
  )

  result = machine.emitToString(module, AssemblyFile)
140  
141  # ############################################################
142  #
143  #                    Hello world
144  #
145  # ############################################################
146  
147  echo "Nvidia JIT compiler Hello World"
148  
proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) =
  ## Populate `module` with two demo kernels operating on i128 values:
  ##   addKernelName(r: ptr i128, a, b: i128)  -> r[] = a + b
  ##   mulKernelName(r: ptr i128, a, b: i128)  -> r[] = a * b
  ## Each kernel is registered as a callable CUDA kernel via
  ## `setCallableCudaKernel` (presumably emitting the nvvm.annotations
  ## metadata and a public wrapper — confirm in the code generator).
  ## The finished LLVM IR is echoed for inspection.

  # ######################################
  # Metadata

  const triple = "nvptx64-nvidia-cuda"
  # Datalayout for NVVM IR 1.8 (CUDA 11.6)
  const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

  # ######################################
  # LLVM IR codegen

  module.setTarget(triple)
  module.setDataLayout(datalayout)
  let i128 = ctx.int128_t()
  let void_t = ctx.void_t()

  let builder = ctx.createBuilder()
  defer: builder.dispose()

  block:
    # void addKernel(i128* r, i128 a, i128 b) { *r = a + b; }
    let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let addKernel = module.addFunction(addKernelName, addType)
    let blck = ctx.appendBasicBlock(addKernel, "addBody")
    builder.positionAtEnd(blck)
    let r = addKernel.getParam(0)
    let a = addKernel.getParam(1)
    let b = addKernel.getParam(2)
    let sum = builder.add(a, b, "sum")
    builder.store(sum, r)
    builder.retVoid()

    module.setCallableCudaKernel((addType, addKernel))

  block:
    # void mulKernel(i128* r, i128 a, i128 b) { *r = a * b; }
    let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let mulKernel = module.addFunction(mulKernelName, mulType)
    let blck = ctx.appendBasicBlock(mulKernel, "mulBody")
    builder.positionAtEnd(blck)
    let r = mulKernel.getParam(0)
    let a = mulKernel.getParam(1)
    let b = mulKernel.getParam(2)
    let prod = builder.mul(a, b, "prod")
    builder.store(prod, r)
    builder.retVoid()

    module.setCallableCudaKernel((mulType, mulKernel))

  # Abort the process if the generated IR is malformed.
  module.verify(AbortProcessAction)

  block:
    echo "================="
    echo "LLVM IR output"
    echo $module
    echo "================="
204  
func toHex*(a: uint64): string =
  ## Render `a` as fixed-width lowercase hexadecimal (16 characters,
  ## most-significant nibble first), without a "0x" prefix.
  const digits = "0123456789abcdef"
  result = newString(2 * sizeof(uint64))
  var acc = a
  # Consume the value nibble by nibble, filling the string right to left.
  for pos in 0 ..< result.len:
    result[result.len - 1 - pos] = digits[int(acc and 0xF)]
    acc = acc shr 4
213  
func toString*(a: openArray[uint64]): string =
  ## Format a little-endian limb array (a[0] = least-significant limb)
  ## as a single "0x"-prefixed big-endian hex number.
  result = "0x"
  var i = a.len - 1
  while i >= 0:
    result.add a[i].toHex()
    dec i
218  
type
  # Selects which toolchain lowers the LLVM IR module to PTX in `main`.
  CodegenBackend = enum
    PTXviaNvidiaNvvm # proprietary libNVVM (ptxCodegenViaNvidiaNvvm)
    PTXviaLlvmNvptx  # LLVM's NVPTX backend (ptxCodegenViaLlvmNvptx)
223  
proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction =
  ## Look up a kernel entry point in a JIT-loaded CUDA module.
  ## The "_public" suffix presumably matches the wrapper name emitted by
  ## `setCallableCudaKernel` in writeExampleAddMul — confirm against the
  ## code generator. Quits via `check` if the symbol is absent.
  check cuModuleGetFunction(result, cuMod, fnName & "_public")
226  
proc main(backend: CodegenBackend) =
  ## End-to-end demo: build LLVM IR for 128-bit add/mul kernels, lower it
  ## to PTX with the selected `backend`, JIT-load the PTX on the first CUDA
  ## device, launch both kernels once and print the results.
  ## Any driver/NVVM failure quits the process via the `check` templates.

  #######################################
  # GPU init
  let cudaDevice = cudaDeviceInit()
  # Query the device's compute capability so codegen targets the right SM.
  var sm: tuple[major, minor: int32]
  check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
  check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)

  #######################################
  # LLVM IR
  let ctx = createContext()
  let module = ctx.createModule("test_nnvm")

  let addKernelName = "addKernel"
  let mulKernelName = "mulKernel"

  writeExampleAddMul(ctx, module, addKernelName, mulKernelName)

  #######################################
  # PTX codegen
  let ptx = case backend
    of PTXviaNvidiaNvvm:
      module.ptxCodegenViaNvidiaNVVM(sm)
    of PTXviaLlvmNvptx:
      module.ptxCodegenViaLlvmNvptx(sm)

  # The IR is no longer needed once PTX text exists; release in reverse
  # order of creation (module before its owning context).
  module.dispose()
  ctx.dispose()

  block:
    echo "================="
    echo "PTX output"
    echo $ptx
    echo "================="

  #######################################
  # GPU JIT
  var cuCtx: CUcontext
  var cuMod: CUmodule
  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptx)
  let addKernel = cuMod.getCudaKernel(addKernelName)
  let mulKernel = cuMod.getCudaKernel(mulKernelName)

  #######################################
  # Kernel launch
  # r is deliberately left uninitialized ({.noInit.}); the first echo below
  # prints whatever garbage the stack holds, before the GPU overwrites it.
  var r{.noInit.}, a, b: array[2, uint64]

  # Operands as (hi, lo) limb pairs of a 128-bit integer.
  a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
  b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64

  echo "r:   ", r.toString()
  echo "a:   ", a.toString()
  echo "b:   ", b.toString()

  var rGPU: CUdeviceptr
  check cuMemAlloc(rGPU, csize_t sizeof(r))

  # Per cuLaunchKernel, kernelParams is an array of pointers, one per kernel
  # argument: &rGPU (the device pointer itself), &a and &b (i128 by value).
  let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]

  # Single-thread launch: grid 1x1x1, block 1x1x1, no shared mem, default stream.
  check cuLaunchKernel(
          addKernel,
          1, 1, 1,
          1, 1, 1,
          0, CUstream(nil),
          params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a+b: ", r.toString()

  # Reuse the same argument array and output buffer for the mul kernel.
  check cuLaunchKernel(
          mulKernel,
          1, 1, 1,
          1, 1, 1,
          0, CUstream(nil),
          params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a*b: ", r.toString()

  #######################################
  # Cleanup
  # Release in reverse order of acquisition; nil-ing the handles guards
  # against accidental reuse after free.

  check cuMemFree(rGPU)
  rGPU = CUdeviceptr(nil)

  check cuModuleUnload(cuMod)
  cuMod = CUmodule(nil)

  check cuCtxDestroy(cuCtx)
  cuCtx = CUcontext(nil)
319  
# Run the full demo once per PTX codegen backend.
echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
main(PTXviaNvidiaNvvm)
echo "\n\nEnd: Compilation via Nvidia NVVM\n################################"

echo "\n\nCompilation via LLVM NVPTX\n##########################\n"
main(PTXviaLlvmNvptx)
echo "\n\nEnd: Compilation via LLVM NVPTX\n###############################"