bindings.cpp
1 // bindings.cpp 2 // pybind11 bindings for CUDA kernels 3 // 4 // Demonstrates: 5 // - numpy array to device memory transfer 6 // - Kernel launch from pybind11 7 // - Device memory management 8 9 #include <cuda_runtime.h> 10 11 #include <pybind11/numpy.h> 12 #include <pybind11/pybind11.h> 13 14 // From kernels.cu 15 void launch_vector_scale(float* data, float scale, int n); 16 void launch_saxpy(float* y, float a, const float* x, int n); 17 void launch_dot_product(float* result, const float* a, const float* b, int n); 18 19 namespace py = pybind11; 20 21 // ============================================================================= 22 // Scale array elements on GPU 23 // ============================================================================= 24 25 py::array_t<float> scale_array(py::array_t<float> input, float scale) { 26 py::buffer_info buf = input.request(); 27 28 if (buf.ndim != 1) { 29 throw std::runtime_error("Input must be 1-dimensional"); 30 } 31 32 int n = static_cast<int>(buf.size); 33 float* ptr = static_cast<float*>(buf.ptr); 34 35 // Allocate device memory 36 float* d_data; 37 cudaMalloc(&d_data, n * sizeof(float)); 38 cudaMemcpy(d_data, ptr, n * sizeof(float), cudaMemcpyHostToDevice); 39 40 // Launch kernel 41 launch_vector_scale(d_data, scale, n); 42 43 // Copy result back 44 auto result = py::array_t<float>(n); 45 py::buffer_info result_buf = result.request(); 46 cudaMemcpy(result_buf.ptr, d_data, n * sizeof(float), cudaMemcpyDeviceToHost); 47 48 cudaFree(d_data); 49 50 return result; 51 } 52 53 // ============================================================================= 54 // SAXPY: y = a*x + y on GPU 55 // ============================================================================= 56 57 py::array_t<float> saxpy(py::array_t<float> y, float a, py::array_t<float> x) { 58 py::buffer_info y_buf = y.request(); 59 py::buffer_info x_buf = x.request(); 60 61 if (y_buf.ndim != 1 || x_buf.ndim != 1) { 62 throw std::runtime_error("Arrays must be 1-dimensional"); 63 } 64 if (y_buf.size != x_buf.size) { 65 throw std::runtime_error("Arrays must have same size"); 66 } 67 68 int n = static_cast<int>(y_buf.size); 69 70 float *d_x, *d_y; 71 cudaMalloc(&d_x, n * sizeof(float)); 72 cudaMalloc(&d_y, n * sizeof(float)); 73 74 cudaMemcpy(d_x, x_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 75 cudaMemcpy(d_y, y_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 76 77 launch_saxpy(d_y, a, d_x, n); 78 79 auto result = py::array_t<float>(n); 80 cudaMemcpy(result.request().ptr, d_y, n * sizeof(float), cudaMemcpyDeviceToHost); 81 82 cudaFree(d_x); 83 cudaFree(d_y); 84 85 return result; 86 } 87 88 // ============================================================================= 89 // Dot product on GPU 90 // ============================================================================= 91 92 float dot(py::array_t<float> a, py::array_t<float> b) { 93 py::buffer_info a_buf = a.request(); 94 py::buffer_info b_buf = b.request(); 95 96 if (a_buf.ndim != 1 || b_buf.ndim != 1) { 97 throw std::runtime_error("Arrays must be 1-dimensional"); 98 } 99 if (a_buf.size != b_buf.size) { 100 throw std::runtime_error("Arrays must have same size"); 101 } 102 103 int n = static_cast<int>(a_buf.size); 104 105 float *d_a, *d_b, *d_result; 106 cudaMalloc(&d_a, n * sizeof(float)); 107 cudaMalloc(&d_b, n * sizeof(float)); 108 cudaMalloc(&d_result, sizeof(float)); 109 cudaMemset(d_result, 0, sizeof(float)); 110 111 cudaMemcpy(d_a, a_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 112 cudaMemcpy(d_b, b_buf.ptr, n * sizeof(float), cudaMemcpyHostToDevice); 113 114 launch_dot_product(d_result, d_a, d_b, n); 115 116 float result; 117 cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost); 118 119 cudaFree(d_a); 120 cudaFree(d_b); 121 cudaFree(d_result); 122 123 return result; 124 } 125 126 // ============================================================================= 127 // Check if CUDA is available 128 // ============================================================================= 129 130 bool nv_available() { 131 int device_count = 0; 132 cudaError_t error = cudaGetDeviceCount(&device_count); 133 return (error == cudaSuccess && device_count > 0); 134 } 135 136 std::string nv_device_name() { 137 int device_count = 0; 138 if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) { 139 return "No CUDA devices available"; 140 } 141 cudaDeviceProp props; 142 cudaGetDeviceProperties(&props, 0); 143 return std::string(props.name); 144 } 145 146 // ============================================================================= 147 // Module definition 148 // ============================================================================= 149 150 PYBIND11_MODULE(gpu_module, m) { 151 m.doc() = "GPU-accelerated operations via CUDA"; 152 153 m.def("nv_available", &nv_available, "Check if CUDA is available"); 154 155 m.def("nv_device_name", &nv_device_name, "Get the name of the CUDA device"); 156 157 m.def("scale_array", &scale_array, "Scale array elements on GPU", py::arg("input"), 158 py::arg("scale")); 159 160 m.def("saxpy", &saxpy, "SAXPY: y = a*x + y on GPU", py::arg("y"), py::arg("a"), py::arg("x")); 161 162 m.def("dot", &dot, "Dot product on GPU", py::arg("a"), py::arg("b")); 163 }