// src/examples/python-cuda/kernels.cu
 1  // kernels.cu
 2  // CUDA kernels for Python bindings
 3  //
 4  // Compiled with clang -x cuda (NOT nvcc)
 5  
#include "kernels.cuh"

#include <cstdio>
#include <cuda_runtime.h>
 8  
 9  // =============================================================================
10  // Vector scale kernel
11  // =============================================================================
12  
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void vector_scale_kernel(float *data, float scale, int n) {
  // Flat 1-D global thread index; threads beyond the array tail exit early.
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i >= n) {
    return;
  }
  data[i] *= scale;
}
20  
// Scales n floats in device memory in place: data[i] *= scale.
//
// data must be a device pointer. Launches 256-thread blocks and blocks the
// host until the kernel completes. A non-positive n is a no-op — launching
// a zero-block grid would otherwise fail with cudaErrorInvalidConfiguration.
// Errors are reported to stderr (the interface is void, so they cannot be
// returned to the caller).
void launch_vector_scale(float *data, float scale, int n) {
  if (n <= 0) {
    return;  // nothing to do; avoid an invalid zero-block launch
  }
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  vector_scale_kernel<<<num_blocks, block_size>>>(data, scale, n);
  // Kernel launches do not return errors directly: check the launch-config
  // error now, then surface asynchronous execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "vector_scale launch failed: %s\n",
            cudaGetErrorString(err));
    return;
  }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "vector_scale kernel failed: %s\n",
            cudaGetErrorString(err));
  }
}
27  
28  // =============================================================================
29  // SAXPY kernel: y = a*x + y
30  // =============================================================================
31  
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void saxpy_kernel(float *y, float a, const float *x, int n) {
  // One element per thread; out-of-range threads exit immediately.
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i >= n) {
    return;
  }
  y[i] += a * x[i];
}
39  
// SAXPY on device memory: y[i] = a * x[i] + y[i] for i in [0, n).
//
// x and y must be device pointers. Launches 256-thread blocks and blocks
// the host until the kernel completes. A non-positive n is a no-op —
// launching a zero-block grid would otherwise fail with
// cudaErrorInvalidConfiguration. Errors are reported to stderr (the
// interface is void, so they cannot be returned to the caller).
void launch_saxpy(float *y, float a, const float *x, int n) {
  if (n <= 0) {
    return;  // nothing to do; avoid an invalid zero-block launch
  }
  const int block_size = 256;
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  saxpy_kernel<<<num_blocks, block_size>>>(y, a, x, n);
  // Kernel launches do not return errors directly: check the launch-config
  // error now, then surface asynchronous execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "saxpy launch failed: %s\n", cudaGetErrorString(err));
    return;
  }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "saxpy kernel failed: %s\n", cudaGetErrorString(err));
  }
}
46  
47  // =============================================================================
48  // Dot product kernel (parallel reduction)
49  // =============================================================================
50  
// Computes a per-block partial dot product of a and b, then accumulates each
// block's partial sum into *result with a single atomicAdd.
//
// Preconditions (implied by the shared array size and reduction loop below):
//   - blockDim.x must be <= 256 (shared_data has exactly 256 slots) and a
//     power of two (the halving reduction skips elements otherwise).
//   - *result must be zero-initialized by the caller before the launch;
//     this kernel only adds into it.
// cppcheck-suppress unusedFunction ; called via CUDA launch syntax
__global__ void dot_product_kernel(float *result, const float *a,
                                   const float *b, int n) {
  // Per-block scratch for the tree reduction; sized for the 256-thread
  // blocks used by launch_dot_product.
  __shared__ float shared_data[256];

  int tid = threadIdx.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Each thread computes one product
  // (out-of-range threads contribute 0 so the reduction stays valid).
  shared_data[tid] = (idx < n) ? a[idx] * b[idx] : 0.0f;
  __syncthreads();

  // Parallel reduction in shared memory
  // The active span halves each pass. The barrier sits outside the `if`
  // so every thread in the block reaches it, as __syncthreads requires.
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (tid < stride) {
      shared_data[tid] += shared_data[tid + stride];
    }
    __syncthreads();
  }

  // Thread 0 writes result
  // (one atomic per block keeps contention on *result low)
  if (tid == 0) {
    atomicAdd(result, shared_data[0]);
  }
}
76  
// Computes the dot product of n-element device vectors a and b and
// accumulates it into *result (a device pointer to a single float).
//
// NOTE(review): the kernel only atomicAdds into *result, so the caller is
// expected to zero *result before this call — confirm against the Python
// binding layer. Blocks the host until the kernel completes. A non-positive
// n is a no-op — launching a zero-block grid would otherwise fail with
// cudaErrorInvalidConfiguration. Errors are reported to stderr (the
// interface is void, so they cannot be returned to the caller).
void launch_dot_product(float *result, const float *a, const float *b, int n) {
  if (n <= 0) {
    return;  // nothing to do; avoid an invalid zero-block launch
  }
  const int block_size = 256;  // must match shared_data[256] in the kernel
  const int num_blocks = (n + block_size - 1) / block_size;  // ceil-div
  dot_product_kernel<<<num_blocks, block_size>>>(result, a, b, n);
  // Kernel launches do not return errors directly: check the launch-config
  // error now, then surface asynchronous execution errors at the sync.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    fprintf(stderr, "dot_product launch failed: %s\n",
            cudaGetErrorString(err));
    return;
  }
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    fprintf(stderr, "dot_product kernel failed: %s\n",
            cudaGetErrorString(err));
  }
}