kernels.cu
// kernels.cu
// CUDA kernels for Python bindings
//
// Compiled with clang -x cuda (NOT nvcc)

#include <cstdio>

#include <cuda_runtime.h>

#include "kernels.cuh"

// =============================================================================
// Shared launch configuration and error reporting
// =============================================================================

// Threads per block for every kernel in this file.
// Must stay a power of two: dot_product_kernel's tree reduction halves its
// stride each step, and its static shared-memory buffer is sized to exactly
// this many floats. Keeping one constant ties the kernel and launchers
// together instead of relying on two separate literal 256s agreeing.
static constexpr int kBlockSize = 256;

// Ceil-divide: smallest grid that covers n elements, one thread per element.
static int num_blocks_for(int n) {
    return (n + kBlockSize - 1) / kBlockSize;
}

// Log any pending CUDA error (launch-configuration or asynchronous execution
// error) to stderr. Kernel launches do not return a status directly, so the
// error must be fetched with cudaGetLastError(); the launch wrappers return
// void, so without changing their interface the best we can do is report
// rather than propagate.
static void report_cuda_error(const char* where) {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", where, cudaGetErrorString(err));
    }
}

// =============================================================================
// Vector scale kernel
// =============================================================================

// In-place scale: data[i] *= scale for every i in [0, n).
// Expects a 1-D launch with at least n threads in total.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void vector_scale_kernel(float* data, float scale, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // The grid generally overshoots n; guard the tail threads.
    if (idx < n) {
        data[idx] *= scale;
    }
}

// Host wrapper: scales n floats in device buffer `data` by `scale`.
// Synchronous — returns after the kernel completes. Errors are logged to
// stderr. A no-op for n <= 0 (a zero-block grid is an invalid launch config).
void launch_vector_scale(float* data, float scale, int n) {
    if (n <= 0) {
        return;
    }
    vector_scale_kernel<<<num_blocks_for(n), kBlockSize>>>(data, scale, n);
    report_cuda_error("launch_vector_scale (launch)");
    cudaDeviceSynchronize();
    report_cuda_error("launch_vector_scale (sync)");
}

// =============================================================================
// SAXPY kernel: y = a*x + y
// =============================================================================

// Element-wise y[i] = a * x[i] + y[i] for every i in [0, n).
// Expects a 1-D launch with at least n threads in total.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void saxpy_kernel(float* y, float a, const float* x, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = a * x[idx] + y[idx];
    }
}

// Host wrapper: y = a*x + y over n floats in device buffers `y` and `x`.
// Synchronous — returns after the kernel completes. Errors are logged to
// stderr. A no-op for n <= 0.
void launch_saxpy(float* y, float a, const float* x, int n) {
    if (n <= 0) {
        return;
    }
    saxpy_kernel<<<num_blocks_for(n), kBlockSize>>>(y, a, x, n);
    report_cuda_error("launch_saxpy (launch)");
    cudaDeviceSynchronize();
    report_cuda_error("launch_saxpy (sync)");
}

// =============================================================================
// Dot product kernel (parallel reduction)
// =============================================================================

// Accumulates dot(a, b) over [0, n) into *result.
//
// Preconditions:
//   - blockDim.x == kBlockSize (the shared buffer is sized to it, and the
//     tree reduction requires a power-of-two block size);
//   - *result must be initialized (typically zeroed) by the caller before the
//     launch — each block atomicAdds its partial sum into it.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void dot_product_kernel(float* result, const float* a, const float* b, int n) {
    __shared__ float shared_data[kBlockSize];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread contributes one product; tail threads contribute 0 so the
    // whole block can participate in the reduction uniformly.
    shared_data[tid] = (idx < n) ? a[idx] * b[idx] : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory. The barrier sits outside the
    // divergent branch so every thread in the block reaches it.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            shared_data[tid] += shared_data[tid + stride];
        }
        __syncthreads();
    }

    // One atomic per block: thread 0 folds the block's partial sum into the
    // global accumulator.
    if (tid == 0) {
        atomicAdd(result, shared_data[0]);
    }
}

// Host wrapper: accumulates dot(a, b) over n floats into device scalar
// *result. The caller must zero *result beforehand (the kernel adds into it).
// Synchronous — returns after the kernel completes. Errors are logged to
// stderr. A no-op for n <= 0.
void launch_dot_product(float* result, const float* a, const float* b, int n) {
    if (n <= 0) {
        return;
    }
    dot_product_kernel<<<num_blocks_for(n), kBlockSize>>>(result, a, b, n);
    report_cuda_error("launch_dot_product (launch)");
    cudaDeviceSynchronize();
    report_cuda_error("launch_dot_product (sync)");
}