kernels.cu
// kernels.cu
// CUDA kernels for Python bindings
//
// Compiled with clang -x cuda (NOT nvcc)

#include "kernels.cuh"
#include <cuda_runtime.h>
#include <cstdio>

// Threads per block used by every launcher below.  dot_product_kernel's
// shared-memory array is sized to this constant, so that kernel MUST be
// launched with exactly kBlockSize threads per block, and kBlockSize must
// stay a power of two for the tree reduction to be correct.
constexpr int kBlockSize = 256;

// Ceil-divide: number of blocks needed to cover n elements (n > 0).
static inline int grid_size(int n) { return (n + kBlockSize - 1) / kBlockSize; }

// Surface kernel errors.  The public launchers return void, so failures are
// reported on stderr instead of being silently discarded (previously both the
// launch status and cudaDeviceSynchronize's return value were ignored).
static void sync_and_check(const char *what) {
    cudaError_t err = cudaGetLastError();  // launch-configuration errors
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();     // asynchronous execution errors
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what,
                cudaGetErrorString(err));
    }
}

// =============================================================================
// Vector scale kernel
// =============================================================================

// In-place scale: data[i] *= scale for each i in [0, n).
// One element per thread; 1D grid.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void vector_scale_kernel(float *data, float scale, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {  // guard: grid overshoots when n % blockDim.x != 0
        data[idx] *= scale;
    }
}

// Scales n device floats in place.  `data` must be a device pointer.
// Blocks until the kernel completes; errors are logged to stderr.
// No-op for n <= 0 (a zero-block launch would be an invalid configuration).
void launch_vector_scale(float *data, float scale, int n) {
    if (n <= 0) {
        return;
    }
    vector_scale_kernel<<<grid_size(n), kBlockSize>>>(data, scale, n);
    sync_and_check("launch_vector_scale");
}

// =============================================================================
// SAXPY kernel: y = a*x + y
// =============================================================================

// Elementwise y[i] = a * x[i] + y[i] for each i in [0, n).
// One element per thread; 1D grid.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void saxpy_kernel(float *y, float a, const float *x, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {  // guard: grid overshoots when n % blockDim.x != 0
        y[idx] = a * x[idx] + y[idx];
    }
}

// SAXPY on n device floats: y += a*x.  `y` and `x` must be device pointers.
// Blocks until the kernel completes; errors are logged to stderr.
// No-op for n <= 0 (a zero-block launch would be an invalid configuration).
void launch_saxpy(float *y, float a, const float *x, int n) {
    if (n <= 0) {
        return;
    }
    saxpy_kernel<<<grid_size(n), kBlockSize>>>(y, a, x, n);
    sync_and_check("launch_saxpy");
}

// =============================================================================
// Dot product kernel (parallel reduction)
// =============================================================================

// Per-block tree reduction of a[i]*b[i], accumulated into *result with one
// atomicAdd per block.
// Preconditions: blockDim.x == kBlockSize (the shared array is sized to it)
// and blockDim.x is a power of two (required by the halving reduction).
// The caller is responsible for zero-initializing *result before the launch;
// this kernel ADDS to whatever value is already there.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void dot_product_kernel(float *result, const float *a,
                                   const float *b, int n) {
    __shared__ float shared_data[kBlockSize];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread contributes one product; out-of-range threads contribute
    // 0.0f so the tail block reduces correctly.
    shared_data[tid] = (idx < n) ? a[idx] * b[idx] : 0.0f;
    __syncthreads();

    // Tree reduction in shared memory; active range halves each step.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            shared_data[tid] += shared_data[tid + stride];
        }
        __syncthreads();  // outside the if: all threads must reach the barrier
    }

    // One atomic per block keeps global contention low.
    if (tid == 0) {
        atomicAdd(result, shared_data[0]);
    }
}

// Accumulates dot(a, b) over n device floats into *result (device pointer).
// NOTE: *result is NOT zeroed here — the caller must initialize it.
// Blocks until the kernel completes; errors are logged to stderr.
// No-op for n <= 0: *result is left untouched.
void launch_dot_product(float *result, const float *a, const float *b, int n) {
    if (n <= 0) {
        return;
    }
    dot_product_kernel<<<grid_size(n), kBlockSize>>>(result, a, b, n);
    sync_and_check("launch_dot_product");
}