// src/examples/python-cuda/kernels.cu
 1  // kernels.cu
 2  // CUDA kernels for Python bindings
 3  //
 4  // Compiled with clang -x cuda (NOT nvcc)
 5  
#include <cstdio>

#include <cuda_runtime.h>

#include "kernels.cuh"
 9  
10  // =============================================================================
11  // Vector scale kernel
12  // =============================================================================
13  
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
// In-place scale: data[i] *= scale for i in [0, n).
// Expects a 1D grid of 1D blocks; one element per thread.
__global__ void vector_scale_kernel(float* data, float scale, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Guard the partially filled last block.
  if (i >= n) {
    return;
  }
  data[i] *= scale;
}
21  
// Host wrapper: scales n floats in device memory by `scale`, in place.
// `data` must be a device pointer. Blocks until the kernel completes.
// Errors (bad launch config or async execution faults) are reported to
// stderr; the CUDA error state is consumed via cudaGetLastError().
void launch_vector_scale(float* data, float scale, int n) {
  // A zero-block grid is an invalid launch configuration; nothing to do.
  if (n <= 0) {
    return;
  }
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  vector_scale_kernel<<<num_blocks, block_size>>>(data, scale, n);
  // Launch-config errors surface here; execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "launch_vector_scale: %s\n", cudaGetErrorString(err));
  }
}
28  
29  // =============================================================================
30  // SAXPY kernel: y = a*x + y
31  // =============================================================================
32  
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
// SAXPY: y[i] <- a * x[i] + y[i] for i in [0, n).
// Expects a 1D grid of 1D blocks; one element per thread.
__global__ void saxpy_kernel(float* y, float a, const float* x, int n) {
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  // Tail threads past n have no work.
  if (i >= n) {
    return;
  }
  y[i] += a * x[i];
}
40  
// Host wrapper for SAXPY (y = a*x + y) over n floats.
// `y` and `x` must be device pointers. Blocks until the kernel completes.
// Errors (bad launch config or async execution faults) are reported to
// stderr; the CUDA error state is consumed via cudaGetLastError().
void launch_saxpy(float* y, float a, const float* x, int n) {
  // A zero-block grid is an invalid launch configuration; nothing to do.
  if (n <= 0) {
    return;
  }
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  saxpy_kernel<<<num_blocks, block_size>>>(y, a, x, n);
  // Launch-config errors surface here; execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "launch_saxpy: %s\n", cudaGetErrorString(err));
  }
}
47  
48  // =============================================================================
49  // Dot product kernel (parallel reduction)
50  // =============================================================================
51  
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
// Dot product of a and b: each block tree-reduces its elements in shared
// memory, then thread 0 folds the block sum into *result with one atomicAdd.
// Preconditions: blockDim.x == 256 (static shared-array size) and a power of
// two; *result must be zeroed by the caller before launch, since this kernel
// only accumulates into it.
__global__ void dot_product_kernel(float* result, const float* a, const float* b, int n) {
  __shared__ float partial[256];

  int lane = threadIdx.x;
  int gid = blockIdx.x * blockDim.x + threadIdx.x;

  // One product per thread; out-of-range threads contribute zero.
  partial[lane] = (gid < n) ? a[gid] * b[gid] : 0.0f;
  __syncthreads();

  // Halve the active range each pass until slot 0 holds the block sum.
  // The barrier sits outside the divergent branch so all threads reach it.
  for (int half = blockDim.x / 2; half > 0; half >>= 1) {
    if (lane < half) {
      partial[lane] += partial[lane + half];
    }
    __syncthreads();
  }

  // One atomic per block keeps contention on *result low.
  if (lane == 0) {
    atomicAdd(result, partial[0]);
  }
}
76  
// Host wrapper for the dot-product reduction over n elements.
// `result`, `a`, `b` must be device pointers. Blocks until completion.
// NOTE(review): the kernel atomically accumulates into *result without
// zeroing it first — the caller is expected to zero *result before this
// call (verify against the Python bindings that invoke it).
// Errors (bad launch config or async execution faults) are reported to
// stderr; the CUDA error state is consumed via cudaGetLastError().
void launch_dot_product(float* result, const float* a, const float* b, int n) {
  // A zero-block grid is an invalid launch configuration; nothing to do.
  if (n <= 0) {
    return;
  }
  // block_size must stay 256: the kernel's shared array is sized to match.
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  dot_product_kernel<<<num_blocks, block_size>>>(result, a, b, n);
  // Launch-config errors surface here; execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();
  }
  if (err != cudaSuccess) {
    fprintf(stderr, "launch_dot_product: %s\n", cudaGetErrorString(err));
  }
}