// src/examples/python-cuda/bindings.cpp
  1  // bindings.cpp
  2  // pybind11 bindings for CUDA kernels
  3  //
  4  // Demonstrates:
  5  //   - numpy array to device memory transfer
  6  //   - Kernel launch from pybind11
  7  //   - Device memory management
  8  
  9  #include <cuda_runtime.h>
 10  
 11  #include <pybind11/numpy.h>
 12  #include <pybind11/pybind11.h>
 13  
 14  #include <stdexcept>
 15  #include <string>
 12  
 13  // From kernels.cu
 14  void launch_vector_scale(float *data, float scale, int n);
 15  void launch_saxpy(float *y, float a, const float *x, int n);
 16  void launch_dot_product(float *result, const float *a, const float *b, int n);
 17  
 18  namespace py = pybind11;
 19  
 20  // =============================================================================
 21  // Scale array elements on GPU
 22  // =============================================================================
 23  
 24  py::array_t<float> scale_array(py::array_t<float> input, float scale) {
 25    py::buffer_info buf = input.request();
 26  
 27    if (buf.ndim != 1) {
 28      throw std::runtime_error("Input must be 1-dimensional");
 29    }
 30  
 31    int n = static_cast<int>(buf.size);
 32    float *ptr = static_cast<float *>(buf.ptr);
 33  
 34    // Allocate device memory
 35    float *d_data;
 36    cudaMalloc(&d_data, n * sizeof(float));
 37    cudaMemcpy(d_data, ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 38  
 39    // Launch kernel
 40    launch_vector_scale(d_data, scale, n);
 41  
 42    // Copy result back
 43    auto result = py::array_t<float>(n);
 44    py::buffer_info result_buf = result.request();
 45    cudaMemcpy(result_buf.ptr, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
 46  
 47    cudaFree(d_data);
 48  
 49    return result;
 50  }
 51  
 52  // =============================================================================
 53  // SAXPY: y = a*x + y on GPU
 54  // =============================================================================
 55  
 56  py::array_t<float> saxpy(py::array_t<float> y, float a, py::array_t<float> x) {
 57    py::buffer_info y_buf = y.request();
 58    py::buffer_info x_buf = x.request();
 59  
 60    if (y_buf.ndim != 1 || x_buf.ndim != 1) {
 61      throw std::runtime_error("Arrays must be 1-dimensional");
 62    }
 63    if (y_buf.size != x_buf.size) {
 64      throw std::runtime_error("Arrays must have same size");
 65    }
 66  
 67    int n = static_cast<int>(y_buf.size);
 68  
 69    float *d_x, *d_y;
 70    cudaMalloc(&d_x, n * sizeof(float));
 71    cudaMalloc(&d_y, n * sizeof(float));
 72  
 73    cudaMemcpy(d_x, x_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 74    cudaMemcpy(d_y, y_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
 75  
 76    launch_saxpy(d_y, a, d_x, n);
 77  
 78    auto result = py::array_t<float>(n);
 79    cudaMemcpy(result.request().ptr, d_y, n * sizeof(float),
 80               cudaMemcpyDeviceToHost);
 81  
 82    cudaFree(d_x);
 83    cudaFree(d_y);
 84  
 85    return result;
 86  }
 87  
 88  // =============================================================================
 89  // Dot product on GPU
 90  // =============================================================================
 91  
 92  float dot(py::array_t<float> a, py::array_t<float> b) {
 93    py::buffer_info a_buf = a.request();
 94    py::buffer_info b_buf = b.request();
 95  
 96    if (a_buf.ndim != 1 || b_buf.ndim != 1) {
 97      throw std::runtime_error("Arrays must be 1-dimensional");
 98    }
 99    if (a_buf.size != b_buf.size) {
100      throw std::runtime_error("Arrays must have same size");
101    }
102  
103    int n = static_cast<int>(a_buf.size);
104  
105    float *d_a, *d_b, *d_result;
106    cudaMalloc(&d_a, n * sizeof(float));
107    cudaMalloc(&d_b, n * sizeof(float));
108    cudaMalloc(&d_result, sizeof(float));
109    cudaMemset(d_result, 0, sizeof(float));
110  
111    cudaMemcpy(d_a, a_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
112    cudaMemcpy(d_b, b_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice);
113  
114    launch_dot_product(d_result, d_a, d_b, n);
115  
116    float result;
117    cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
118  
119    cudaFree(d_a);
120    cudaFree(d_b);
121    cudaFree(d_result);
122  
123    return result;
124  }
125  
126  // =============================================================================
127  // Check if CUDA is available
128  // =============================================================================
129  
130  bool nv_available() {
131    int device_count = 0;
132    cudaError_t error = cudaGetDeviceCount(&device_count);
133    return (error == cudaSuccess && device_count > 0);
134  }
135  
136  std::string nv_device_name() {
137    int device_count = 0;
138    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) {
139      return "No CUDA devices available";
140    }
141    cudaDeviceProp props;
142    cudaGetDeviceProperties(&props, 0);
143    return std::string(props.name);
144  }
145  
146  // =============================================================================
147  // Module definition
148  // =============================================================================
149  
// Module entry point: registers the GPU helpers under the Python module name
// `gpu_module`. The string arguments become the Python-level docstrings, and
// py::arg(...) enables keyword arguments from Python.
PYBIND11_MODULE(gpu_module, m) {
  m.doc() = "GPU-accelerated operations via CUDA";

  // Device discovery helpers — safe to call even when no GPU is present.
  m.def("nv_available", &nv_available, "Check if CUDA is available");

  m.def("nv_device_name", &nv_device_name, "Get the name of the CUDA device");

  // Array operations; each round-trips its data host -> device -> host.
  m.def("scale_array", &scale_array, "Scale array elements on GPU",
        py::arg("input"), py::arg("scale"));

  m.def("saxpy", &saxpy, "SAXPY: y = a*x + y on GPU", py::arg("y"),
        py::arg("a"), py::arg("x"));

  m.def("dot", &dot, "Dot product on GPU", py::arg("a"), py::arg("b"));
}