# hello_world_nvidia.nim
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import ../../constantine/platforms/code_generator/[llvm, nvidia, bindings/c_abi]

# ############################################################
#
#                         NVVM
#
# ############################################################

# https://docs.nvidia.com/cuda/libnvvm-api/index.html
# https://docs.nvidia.com/pdf/libNVVM_API.pdf
# https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
# https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf

# ⚠ NVVM IR is based on LLVM 7.0.1 IR which dates from December 2018.
# There are a couple of caveats:
# - LLVM 7.0.1 is usually not available in repo, making installation difficult
# - There was an ABI breaking bug making the 7.0.1 and 7.1.0 versions messy (https://www.phoronix.com/news/LLVM-7.0.1-Released)
# - LLVM 7.0.1 does not have LLVMBuildCall2 and relies on the deprecated LLVMBuildCall meaning
#   supporting that and latest LLVM (for AMDGPU and SPIR-V backends) will likely have heavy costs
# - When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
#   if the LLVM IR is passed as bitcode,
#   the kernel content is silently discarded, this does not happen with built-in add.
#   It is unsure if it's call2 or inline ASM incompatibility that causes the issues
# - When generating an add-with-carry kernel with inline ASM calls from LLVM-14,
#   if the LLVM IR is passed as textual IR, the code is refused with NVVM_ERROR_INVALID_IR

# Hence, using LLVM NVPTX backend instead of libNVVM is likely the sustainable way forward

static: echo "[Constantine] Using library libnvvm.so"
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".}

type
  NvvmResult* {.size: sizeof(cint).} = enum
    ## Status codes returned by every libNVVM entry point.
    NVVM_SUCCESS = 0
    NVVM_ERROR_OUT_OF_MEMORY = 1
    NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
    NVVM_ERROR_IR_VERSION_MISMATCH = 3
    NVVM_ERROR_INVALID_INPUT = 4
    NVVM_ERROR_INVALID_PROGRAM = 5
    NVVM_ERROR_INVALID_IR = 6
    NVVM_ERROR_INVALID_OPTION = 7
    NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
    NVVM_ERROR_COMPILATION = 9

  NvvmProgram = distinct pointer
    # Opaque handle to an NVVM compilation unit.

# Raw FFI bindings to libNVVM, loaded at runtime from libnvvm.so.
{.push noconv, importc, dynlib: "libnvvm.so".}

proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR, minorIR, majorDbg, minorDbg: var int32): NvvmResult

proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult {.wrapOpenArrayLenType: csize_t.}
proc nvvmCompileProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram; numOptions: int32; options: cstringArray): NvvmResult
proc nvvmGetCompiledResultSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram; buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram; bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult

{.pop.} # {.push noconv, importc, dynlib: "libnvvm.so".}

# ############################################################
#
#                       PTX Codegen
#
# ############################################################

template check*(status: NvvmResult) =
  ## Aborts the process, printing the failing expression, the
  ## instantiation site and the error code, if `status` is not NVVM_SUCCESS.
  let code = status # Assign so execution is done once only.
  if code != NVVM_SUCCESS:
    stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code
    quit 1

proc getNvvmLog(prog: NvvmProgram): string {.used.} =
  ## Retrieves the NVVM verification/compilation log for `prog`.
  ## Returns an empty string when there is no log.
  var logSize: csize_t
  check nvvmGetProgramLogSize(prog, logSize)

  if logSize > 1:
    # Fix: the reported size includes the trailing '\0' (like the compiled
    # result size below), so allocate logSize-1 — the previous newString(logSize)
    # left an embedded NUL at the end of the Nim string.
    result = newString(logSize - 1)
    check nvvmGetProgramLog(prog, result[0].addr)

proc ptxCodegenViaNvidiaNvvm(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## PTX codegen via Nvidia NVVM.
  ## `sm` is the device compute capability, e.g. (major: 8, minor: 6) for sm_86.

  # ######################################
  # LLVM -> NVVM handover

  var prog{.noInit.}: NvvmProgram
  check nvvmCreateProgram(prog)

  let bitcode = module.toBitcode()
  check nvvmAddModuleToProgram(prog, bitcode, cstring module.getIdentifier())

  # ######################################
  # GPU codegen

  check nvvmVerifyProgram(prog, 0, nil)

  let options = allocCStringArray(["-arch=compute_" & $sm.major & $sm.minor])
  check nvvmCompileProgram(prog, 1, options)
  deallocCStringArray(options)
  var ptxSize: csize_t
  check nvvmGetCompiledResultSize(prog, ptxSize)
  result = newString(ptxSize-1) # The NVVM size includes '\0' ending char while Nim excludes it.
  check nvvmGetCompiledResult(prog, result[0].addr)

  check nvvmDestroyProgram(prog)

proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): string =
  ## PTX codegen via the LLVM NVPTX backend (no libNVVM dependency).
  ## `sm` is the device compute capability used to pick the target CPU.

  module.verify(AbortProcessAction)

  initializeFullNVPTXTarget()
  const triple = "nvptx64-nvidia-cuda"

  let machine = createTargetMachine(
    target = toTarget(triple),
    triple = triple,
    cpu = cstring("sm_" & $sm.major & $sm.minor),
    features = "",
    level = CodeGenLevelAggressive,
    reloc = RelocDefault,
    codeModel = CodeModelDefault
  )

  machine.emitToString(module, AssemblyFile)

# ############################################################
#
#                       Hello world
#
# ############################################################

echo "Nvidia JIT compiler Hello World"

proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) =
  ## Emits two i128 CUDA kernels into `module`:
  ## - `addKernelName`: r <- a + b
  ## - `mulKernelName`: r <- a * b
  ## Both take (ptr result, operand a, operand b) and return void.

  # ######################################
  # Metadata

  const triple = "nvptx64-nvidia-cuda"
  # Datalayout for NVVM IR 1.8 (CUDA 11.6)
  const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

  # ######################################
  # LLVM IR codegen

  module.setTarget(triple)
  module.setDataLayout(datalayout)
  let i128 = ctx.int128_t()
  let void_t = ctx.void_t()

  let builder = ctx.createBuilder()
  defer: builder.dispose()

  block:
    # addKernel(r: ptr i128, a: i128, b: i128) { *r = a + b }
    let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let addKernel = module.addFunction(addKernelName, addType)
    let blck = ctx.appendBasicBlock(addKernel, "addBody")
    builder.positionAtEnd(blck)
    let r = addKernel.getParam(0)
    let a = addKernel.getParam(1)
    let b = addKernel.getParam(2)
    let sum = builder.add(a, b, "sum")
    builder.store(sum, r)
    builder.retVoid()

    module.setCallableCudaKernel((addType, addKernel))

  block:
    # mulKernel(r: ptr i128, a: i128, b: i128) { *r = a * b }
    let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let mulKernel = module.addFunction(mulKernelName, mulType)
    let blck = ctx.appendBasicBlock(mulKernel, "mulBody")
    builder.positionAtEnd(blck)
    let r = mulKernel.getParam(0)
    let a = mulKernel.getParam(1)
    let b = mulKernel.getParam(2)
    let prod = builder.mul(a, b, "prod")
    builder.store(prod, r)
    builder.retVoid()

    module.setCallableCudaKernel((mulType, mulKernel))

  module.verify(AbortProcessAction)

  block:
    echo "================="
    echo "LLVM IR output"
    echo $module
    echo "================="

func toHex*(a: uint64): string =
  ## Hex-encodes `a` as exactly 16 lowercase hex digits (zero-padded, no prefix).
  const hexChars = "0123456789abcdef"
  const L = 2*sizeof(uint64)
  result = newString(L)
  var a = a
  # Fill from the least-significant nibble backwards.
  for j in countdown(result.len-1, 0):
    result[j] = hexChars[a and 0xF]
    a = a shr 4

func toString*(a: openArray[uint64]): string =
  ## Hex-encodes a little-endian limb array as "0x..." (most-significant limb first).
  result = "0x"
  for i in countdown(a.len-1, 0):
    result.add toHex(a[i])

type
  CodegenBackend = enum
    ## Which toolchain turns the LLVM IR module into PTX.
    PTXviaNvidiaNvvm
    PTXviaLlvmNvptx

proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction =
  ## Looks up a kernel in a loaded CUDA module.
  # NOTE(review): setCallableCudaKernel presumably exposes the kernel under
  # "<name>_public" — confirm against the code_generator module.
  check cuModuleGetFunction(result, cuMod, fnName & "_public")

proc main(backend: CodegenBackend) =
  ## End-to-end demo: build i128 add/mul kernels as LLVM IR, compile them to
  ## PTX with `backend`, JIT-load on the first CUDA device and run them once.

  #######################################
  # GPU init
  let cudaDevice = cudaDeviceInit()
  var sm: tuple[major, minor: int32]
  check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice)
  check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice)

  #######################################
  # LLVM IR
  let ctx = createContext()
  let module = ctx.createModule("test_nnvm")

  let addKernelName = "addKernel"
  let mulKernelName = "mulKernel"

  writeExampleAddMul(ctx, module, addKernelName, mulKernelName)

  #######################################
  # PTX codegen
  let ptx = case backend
    of PTXviaNvidiaNvvm:
      module.ptxCodegenViaNvidiaNVVM(sm)
    of PTXviaLlvmNvptx:
      module.ptxCodegenViaLlvmNvptx(sm)

  # LLVM objects are no longer needed once we have the PTX text.
  module.dispose()
  ctx.dispose()

  block:
    echo "================="
    echo "PTX output"
    echo $ptx
    echo "================="

  #######################################
  # GPU JIT
  var cuCtx: CUcontext
  var cuMod: CUmodule
  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptx)
  let addKernel = cuMod.getCudaKernel(addKernelName)
  let mulKernel = cuMod.getCudaKernel(mulKernelName)

  #######################################
  # Kernel launch
  # r is deliberately {.noInit.}: the first echo below shows its pre-kernel
  # (indeterminate) content before the GPU overwrites it.
  var r{.noInit.}, a, b: array[2, uint64]

  a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
  b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64

  echo "r: ", r.toString()
  echo "a: ", a.toString()
  echo "b: ", b.toString()

  var rGPU: CUdeviceptr
  check cuMemAlloc(rGPU, csize_t sizeof(r))

  # Kernel parameters: device pointer to r, then a and b by value (host addresses).
  let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]

  # Single thread, single block: this is a smoke test, not a benchmark.
  check cuLaunchKernel(
          addKernel,
          1, 1, 1,
          1, 1, 1,
          0, CUstream(nil),
          params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a+b: ", r.toString()

  check cuLaunchKernel(
          mulKernel,
          1, 1, 1,
          1, 1, 1,
          0, CUstream(nil),
          params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a*b: ", r.toString()

  #######################################
  # Cleanup

  check cuMemFree(rGPU)
  rGPU = CUdeviceptr(nil)

  check cuModuleUnload(cuMod)
  cuMod = CUmodule(nil)

  check cuCtxDestroy(cuCtx)
  cuCtx = CUcontext(nil)

echo "\n\nCompilation via Nvidia NVVM\n###########################\n"
# Run the hello-world demo once per codegen backend.
# The opening NVVM banner is printed right before this driver section;
# banners are bound to named constants so the flow reads linearly.
const
  nvvmEndBanner = "\n\nEnd: Compilation via Nvidia NVVM\n################################"
  nvptxBanner = "\n\nCompilation via LLVM NVPTX\n##########################\n"
  nvptxEndBanner = "\n\nEnd: Compilation via LLVM NVPTX\n###############################"

main(PTXviaNvidiaNvvm)
echo nvvmEndBanner

echo nvptxBanner
main(PTXviaLlvmNvptx)
echo nvptxEndBanner