// src/examples/python-cuda/bindings.cpp
  1  // bindings.cpp
  2  // pybind11 bindings for CUDA kernels
  3  //
  4  // Demonstrates:
  5  //   - numpy array to device memory transfer
  6  //   - Kernel launch from pybind11
  7  //   - Device memory management
  8  
  9  #include <cuda_runtime.h>
 10  
 11  #include <pybind11/numpy.h>
 12  #include <pybind11/pybind11.h>
 13  
 14  // From kernels.cu
 15  void launch_vector_scale(float* data, float scale, int n);
 16  void launch_saxpy(float* y, float a, const float* x, int n);
 17  void launch_dot_product(float* result, const float* a, const float* b, int n);
 18  
 19  namespace py = pybind11;
 20  
 21  // =============================================================================
 22  // Scale array elements on GPU
 23  // =============================================================================
 24  
 25  py::array_t<float> scale_array(py::array_t<float> input, float scale) {
 26    py::buffer_info buf = input.request();
 27  
 28    if (buf.ndim != 1) {
 29      throw std::runtime_error("Input must be 1-dimensional");
 30    }
 31  
 32    int n = static_cast<int>(buf.size);
 33    float* ptr = static_cast<float*>(buf.ptr);
 34  
 35    // Allocate device memory
 36    float* d_data;
 37    cudaMalloc(&d_data, n * sizeof(float));
 38    cudaMemcpy(d_data, ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 39  
 40    // Launch kernel
 41    launch_vector_scale(d_data, scale, n);
 42  
 43    // Copy result back
 44    auto result = py::array_t<float>(n);
 45    py::buffer_info result_buf = result.request();
 46    cudaMemcpy(result_buf.ptr, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
 47  
 48    cudaFree(d_data);
 49  
 50    return result;
 51  }
 52  
 53  // =============================================================================
 54  // SAXPY: y = a*x + y on GPU
 55  // =============================================================================
 56  
 57  py::array_t<float> saxpy(py::array_t<float> y, float a, py::array_t<float> x) {
 58    py::buffer_info y_buf = y.request();
 59    py::buffer_info x_buf = x.request();
 60  
 61    if (y_buf.ndim != 1 || x_buf.ndim != 1) {
 62      throw std::runtime_error("Arrays must be 1-dimensional");
 63    }
 64    if (y_buf.size != x_buf.size) {
 65      throw std::runtime_error("Arrays must have same size");
 66    }
 67  
 68    int n = static_cast<int>(y_buf.size);
 69  
 70    float *d_x, *d_y;
 71    cudaMalloc(&d_x, n * sizeof(float));
 72    cudaMalloc(&d_y, n * sizeof(float));
 73  
 74    cudaMemcpy(d_x, x_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 75    cudaMemcpy(d_y, y_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 76  
 77    launch_saxpy(d_y, a, d_x, n);
 78  
 79    auto result = py::array_t<float>(n);
 80    cudaMemcpy(result.request().ptr, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
 81  
 82    cudaFree(d_x);
 83    cudaFree(d_y);
 84  
 85    return result;
 86  }
 87  
 88  // =============================================================================
 89  // Dot product on GPU
 90  // =============================================================================
 91  
 92  float dot(py::array_t<float> a, py::array_t<float> b) {
 93    py::buffer_info a_buf = a.request();
 94    py::buffer_info b_buf = b.request();
 95  
 96    if (a_buf.ndim != 1 || b_buf.ndim != 1) {
 97      throw std::runtime_error("Arrays must be 1-dimensional");
 98    }
 99    if (a_buf.size != b_buf.size) {
100      throw std::runtime_error("Arrays must have same size");
101    }
102  
103    int n = static_cast<int>(a_buf.size);
104  
105    float *d_a, *d_b, *d_result;
106    cudaMalloc(&d_a, n * sizeof(float));
107    cudaMalloc(&d_b, n * sizeof(float));
108    cudaMalloc(&d_result, sizeof(float));
109    cudaMemset(d_result, 0, sizeof(float));
110  
111    cudaMemcpy(d_a, a_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
112    cudaMemcpy(d_b, b_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
113  
114    launch_dot_product(d_result, d_a, d_b, n);
115  
116    float result;
117    cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
118  
119    cudaFree(d_a);
120    cudaFree(d_b);
121    cudaFree(d_result);
122  
123    return result;
124  }
125  
126  // =============================================================================
127  // Check if CUDA is available
128  // =============================================================================
129  
130  bool nv_available() {
131    int device_count = 0;
132    cudaError_t error = cudaGetDeviceCount(&device_count);
133    return (error == cudaSuccess && device_count > 0);
134  }
135  
136  std::string nv_device_name() {
137    int device_count = 0;
138    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
139      return "No CUDA devices available";
140    }
141    cudaDeviceProp props;
142    cudaGetDeviceProperties(&props, 0);
143    return std::string(props.name);
144  }
145  
146  // =============================================================================
147  // Module definition
148  // =============================================================================
149  
150  PYBIND11_MODULE(gpu_module, m) {
151    m.doc() = "GPU-accelerated operations via CUDA";
152  
153    m.def("nv_available", &nv_available, "Check if CUDA is available");
154  
155    m.def("nv_device_name", &nv_device_name, "Get the name of the CUDA device");
156  
157    m.def("scale_array", &scale_array, "Scale array elements on GPU", py::arg("input"),
158          py::arg("scale"));
159  
160    m.def("saxpy", &saxpy, "SAXPY: y = a*x + y on GPU", py::arg("y"), py::arg("a"), py::arg("x"));
161  
162    m.def("dot", &dot, "Dot product on GPU", py::arg("a"), py::arg("b"));
163  }