bindings.cpp
1 // bindings.cpp 2 // pybind11 bindings for CUDA kernels 3 // 4 // Demonstrates: 5 // - numpy array to device memory transfer 6 // - Kernel launch from pybind11 7 // - Device memory management 8 9 #include <cuda_runtime.h> 10 #include <pybind11/numpy.h> 11 #include <pybind11/pybind11.h> 12 13 // From kernels.cu 14 void launch_vector_scale(float *data, float scale, int n); 15 void launch_saxpy(float *y, float a, const float *x, int n); 16 void launch_dot_product(float *result, const float *a, const float *b, int n); 17 18 namespace py = pybind11; 19 20 // ============================================================================= 21 // Scale array elements on GPU 22 // ============================================================================= 23 24 py::array_t<float> scale_array(py::array_t<float> input, float scale) { 25 py::buffer_info buf = input.request(); 26 27 if (buf.ndim != 1) { 28 throw std::runtime_error("Input must be 1-dimensional"); 29 } 30 31 int n = static_cast<int>(buf.size); 32 float *ptr = static_cast<float *>(buf.ptr); 33 34 // Allocate device memory 35 float *d_data; 36 cudaMalloc(&d_data, n * sizeof(float)); 37 cudaMemcpy(d_data, ptr, n * sizeof(float), cudaMemcpyHostToDevice); 38 39 // Launch kernel 40 launch_vector_scale(d_data, scale, n); 41 42 // Copy result back 43 auto result = py::array_t<float>(n); 44 py::buffer_info result_buf = result.request(); 45 cudaMemcpy(result_buf.ptr, d_data, n * sizeof(float), cudaMemcpyDeviceToHost); 46 47 cudaFree(d_data); 48 49 return result; 50 } 51 52 // ============================================================================= 53 // SAXPY: y = a*x + y on GPU 54 // ============================================================================= 55 56 py::array_t<float> saxpy(py::array_t<float> y, float a, py::array_t<float> x) { 57 py::buffer_info y_buf = y.request(); 58 py::buffer_info x_buf = x.request(); 59 60 if (y_buf.ndim != 1 || x_buf.ndim != 1) { 61 throw std::runtime_error("Arrays must be 1-dimensional"); 62 } 63 if (y_buf.size != x_buf.size) { 64 throw std::runtime_error("Arrays must have same size"); 65 } 66 67 int n = static_cast<int>(y_buf.size); 68 69 float *d_x, *d_y; 70 cudaMalloc(&d_x, n * sizeof(float)); 71 cudaMalloc(&d_y, n * sizeof(float)); 72 73 cudaMemcpy(d_x, x_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 74 cudaMemcpy(d_y, y_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 75 76 launch_saxpy(d_y, a, d_x, n); 77 78 auto result = py::array_t<float>(n); 79 cudaMemcpy(result.request().ptr, d_y, n * sizeof(float), 80 cudaMemcpyDeviceToHost); 81 82 cudaFree(d_x); 83 cudaFree(d_y); 84 85 return result; 86 } 87 88 // ============================================================================= 89 // Dot product on GPU 90 // ============================================================================= 91 92 float dot(py::array_t<float> a, py::array_t<float> b) { 93 py::buffer_info a_buf = a.request(); 94 py::buffer_info b_buf = b.request(); 95 96 if (a_buf.ndim != 1 || b_buf.ndim != 1) { 97 throw std::runtime_error("Arrays must be 1-dimensional"); 98 } 99 if (a_buf.size != b_buf.size) { 100 throw std::runtime_error("Arrays must have same size"); 101 } 102 103 int n = static_cast<int>(a_buf.size); 104 105 float *d_a, *d_b, *d_result; 106 cudaMalloc(&d_a, n * sizeof(float)); 107 cudaMalloc(&d_b, n * sizeof(float)); 108 cudaMalloc(&d_result, sizeof(float)); 109 cudaMemset(d_result, 0, sizeof(float)); 110 111 cudaMemcpy(d_a, a_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 112 cudaMemcpy(d_b, b_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 113 114 launch_dot_product(d_result, d_a, d_b, n); 115 116 float result; 117 cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost); 118 119 cudaFree(d_a); 120 cudaFree(d_b); 121 cudaFree(d_result); 122 123 return result; 124 } 125 126 // ============================================================================= 127 // Check if CUDA is available 128 // ============================================================================= 129 130 bool nv_available() { 131 int device_count = 0; 132 cudaError_t error = cudaGetDeviceCount(&device_count); 133 return (error == cudaSuccess && device_count > 0); 134 } 135 136 std::string nv_device_name() { 137 int device_count = 0; 138 if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) { 139 return "No CUDA devices available"; 140 } 141 cudaDeviceProp props; 142 cudaGetDeviceProperties(&props, 0); 143 return std::string(props.name); 144 } 145 146 // ============================================================================= 147 // Module definition 148 // ============================================================================= 149 150 PYBIND11_MODULE(gpu_module, m) { 151 m.doc() = "GPU-accelerated operations via CUDA"; 152 153 m.def("nv_available", &nv_available, "Check if CUDA is available"); 154 155 m.def("nv_device_name", &nv_device_name, "Get the name of the CUDA device"); 156 157 m.def("scale_array", &scale_array, "Scale array elements on GPU", 158 py::arg("input"), py::arg("scale")); 159 160 m.def("saxpy", &saxpy, "SAXPY: y = a*x + y on GPU", py::arg("y"), 161 py::arg("a"), py::arg("x")); 162 163 m.def("dot", &dot, "Dot product on GPU", py::arg("a"), py::arg("b")); 164 }