// ops.cpp
1 #include "nn.h" 2 #include "indexing.cpp" 3 #include "utils.cpp" 4 #include "backends/select_backend.cpp" 5 6 // **** operations **** 7 // - shared between CUDA and CPU kernels 8 // - user-facing, unlike other abstractions 9 // - call primitives, in addition, record data for the autograd: 10 // - allocating grad buffers 11 // - write local derivatives, for each input 12 13 14 // todo-now: temporarily commented out ops which don't yet have CUDA impl, so that compilation doesn't fail bc linker can't find _k expected by these ops 15 16 tensor* add(tensor* a, tensor* b) { 17 a->num_uses++; 18 b->num_uses++; 19 tensor* t = add_k(a, b); 20 t->is_leaf = false; 21 // todo-low: check bool tensor->requires_grad before doing steps below, including allocating buffers 22 23 // todo: this can be further abstracted -- creating a binary_op function 24 // todo: and even further abstracted -- creating a binary_elementwise_op function 25 26 // fill the additional info on out tensor 27 t->num_inputs = 2; 28 t->inputs[0] = a; 29 t->inputs[1] = b; 30 t->op_type = 0; 31 t->grad_fn = add_bwd; 32 return t; 33 } 34 35 tensor* sub(tensor* a, tensor* b) { 36 a->num_uses++; 37 b->num_uses++; 38 tensor* t = sub_k(a, b); 39 t->is_leaf = false; 40 t->num_inputs = 2; 41 t->inputs[0] = a; 42 t->inputs[1] = b; 43 t->op_type = 1; 44 t->grad_fn = sub_bwd; 45 return t; 46 } 47 48 tensor* mul(tensor* a, tensor* b) { 49 a->num_uses++; 50 b->num_uses++; 51 tensor* t = mul_k(a, b); 52 t->is_leaf = false; 53 t->num_inputs = 2; 54 t->inputs[0] = a; 55 t->inputs[1] = b; 56 t->op_type = 2; 57 t->grad_fn = mul_bwd; 58 return t; 59 } 60 61 tensor* matmul(tensor* a, tensor* b) { 62 a->num_uses++; 63 b->num_uses++; 64 // e.g.: a(N, M) @ b(M, D) = t(N, D) 65 tensor* t = matmul_k(a, b); 66 t->is_leaf = false; 67 t->num_inputs = 2; 68 t->inputs[0] = a; 69 t->inputs[1] = b; 70 t->op_type = 3; 71 t->grad_fn = matmul_bwd; 72 return t; 73 } 74 75 tensor* div(tensor* a, tensor* b) { 76 a->num_uses++; 77 b->num_uses++; 78 tensor* t = div_k(a, b); 79 t->is_leaf = false; 80 t->num_inputs = 2; 81 t->inputs[0] = a; 82 t->inputs[1] = b; 83 t->op_type = 20; 84 t->grad_fn = div_bwd; 85 return t; 86 } 87 88 tensor* repeat(tensor* a, int axis, int num_repeats) { 89 a->num_uses++; 90 tensor* t = repeat_k(a, axis, num_repeats); 91 t->is_leaf = false; 92 t->num_inputs = 1; 93 t->inputs[0] = a; 94 t->non_grad_inputs[0] = axis; 95 t->non_grad_inputs[1] = num_repeats; 96 t->op_type = 18; 97 t->grad_fn = repeat_bwd; 98 return t; 99 } 100 101 tensor* select(tensor* a, tensor* idx) { 102 a->num_uses++; 103 idx->num_uses++; 104 tensor* t = select_k(a, idx); 105 t->is_leaf = false; 106 t->num_inputs = 2; 107 t->inputs[0] = a; 108 t->inputs[1] = idx; 109 t->op_type = 14; 110 t->grad_fn = select_bwd; 111 return t; 112 } 113 114 tensor* pow(tensor* a, int exponent) { 115 a->num_uses++; 116 tensor* t = pow_k(a, exponent); 117 t->is_leaf = false; 118 // comment: note by "inputs" I mean tensor inputs (INPUTS which I'll use compute grads wrt to) 119 // so here even if this op has two inputs, it really has one, for the purpose of the autograd 120 t->num_inputs = 1; 121 t->inputs[0] = a; 122 t->non_grad_inputs[0] = exponent; 123 t->op_type = 4; 124 t->grad_fn = pow_bwd; 125 return t; 126 } 127 128 tensor* relu(tensor* a) { 129 a->num_uses++; 130 tensor* t = relu_k(a); 131 t->is_leaf = false; 132 t->num_inputs = 1; 133 t->inputs[0] = a; 134 t->op_type = 6; 135 t->grad_fn = relu_bwd; 136 return t; 137 } 138 139 tensor* transpose(tensor* a) { 140 a->num_uses++; 141 tensor* t = 
tensor* add(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = add_k(a, b);
    t->is_leaf = false;
    // todo-low: check the bool tensor->requires_grad before doing the steps below, including allocating buffers

    // todo: this can be further abstracted -- into a binary_op function (see the sketch just after this function)
    // todo: and even further abstracted -- into a binary_elementwise_op function

    // fill in the additional info on the out tensor
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 0;
    t->grad_fn = add_bwd;
    return t;
}
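// A hypothetical sketch of the binary_op abstraction mentioned in the todos above:
// add/sub/mul/div/matmul differ only in the kernel they call, the op_type code they record,
// and the grad_fn they attach, so that triple could be passed in as parameters. Kept inside
// #if 0 because the grad_fn signature assumed here is not confirmed by this file.
#if 0
typedef tensor* (*binary_kernel)(tensor*, tensor*);
typedef void (*backward_fn)(tensor*);   // assumed grad_fn signature

tensor* binary_op(tensor* a, tensor* b, binary_kernel k, int op_type, backward_fn bwd) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = op_type;
    t->grad_fn = bwd;
    return t;
}

// add() would then reduce to:
// tensor* add(tensor* a, tensor* b) { return binary_op(a, b, add_k, 0, add_bwd); }
#endif
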
tensor* sub(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = sub_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 1;
    t->grad_fn = sub_bwd;
    return t;
}

tensor* mul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = mul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 2;
    t->grad_fn = mul_bwd;
    return t;
}

tensor* matmul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    // e.g.: a(N, M) @ b(M, D) = t(N, D)
    tensor* t = matmul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 3;
    t->grad_fn = matmul_bwd;
    return t;
}

tensor* div(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = div_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 20;
    t->grad_fn = div_bwd;
    return t;
}

tensor* repeat(tensor* a, int axis, int num_repeats) {
    a->num_uses++;
    tensor* t = repeat_k(a, axis, num_repeats);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->non_grad_inputs[0] = axis;
    t->non_grad_inputs[1] = num_repeats;
    t->op_type = 18;
    t->grad_fn = repeat_bwd;
    return t;
}

tensor* select(tensor* a, tensor* idx) {
    a->num_uses++;
    idx->num_uses++;
    tensor* t = select_k(a, idx);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = idx;
    t->op_type = 14;
    t->grad_fn = select_bwd;
    return t;
}

tensor* pow(tensor* a, int exponent) {
    a->num_uses++;
    tensor* t = pow_k(a, exponent);
    t->is_leaf = false;
    // note: by "inputs" I mean tensor inputs (inputs I'll compute grads w.r.t.),
    // so even though this op takes two arguments, it has only one input for the purposes of the autograd
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->non_grad_inputs[0] = exponent;
    t->op_type = 4;
    t->grad_fn = pow_bwd;
    return t;
}

tensor* relu(tensor* a) {
    a->num_uses++;
    tensor* t = relu_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 6;
    t->grad_fn = relu_bwd;
    return t;
}

tensor* transpose(tensor* a) {
    a->num_uses++;
    tensor* t = transpose_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 7;
    t->grad_fn = transpose_bwd;
    return t;
}

tensor* neg(tensor* a) {
    a->num_uses++;
    tensor* t = neg_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 19;
    t->grad_fn = neg_bwd;
    return t;
}

// todo: does this conflict with exp() from <math.h>?
tensor* exp(tensor* a) {
    a->num_uses++;
    tensor* t = exp_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 16;
    t->grad_fn = exp_bwd;
    return t;
}

tensor* log(tensor* a) {
    a->num_uses++;
    tensor* t = log_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 15;
    t->grad_fn = log_bwd;
    return t;
}

tensor* batched_matmul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    // e.g.: a(B, N, M) @ b(B, M, D) = t(B, N, D)
    tensor* t = batched_matmul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 8;
    t->grad_fn = batched_matmul_bwd;
    return t;
}

tensor* batched_flatten(tensor* a) {
    a->num_uses++;
    tensor* t = batched_flatten_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 13;
    t->grad_fn = batched_flatten_bwd;
    return t;
}

tensor* reduce_sum(tensor* a) {
    a->num_uses++;
    tensor* t = reduce_sum_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 5;
    t->grad_fn = reduce_sum_bwd;
    return t;
}

tensor* batched_reduce_sum(tensor* a) {
    a->num_uses++;
    tensor* t = batched_reduce_sum_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 17;
    t->grad_fn = batched_reduce_sum_bwd;
    return t;
}

tensor* reduce_max(tensor* a) {
    a->num_uses++;
    tensor* t = reduce_max_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 21;
    t->grad_fn = reduce_max_bwd;
    return t;
}

tensor* batched_reduce_max(tensor* a) {
    a->num_uses++;
    tensor* t = batched_reduce_max_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 22;
    t->grad_fn = batched_reduce_max_bwd;
    return t;
}

tensor* conv(tensor* input, tensor* kernel, tensor* bias) {
    input->num_uses++;
    kernel->num_uses++;
    bias->num_uses++;
    tensor* t = conv_k(input, kernel, bias);
    t->is_leaf = false;
    t->num_inputs = 3;
    t->inputs[0] = input;
    t->inputs[1] = kernel;
    t->inputs[2] = bias;
    t->op_type = 9;
    t->grad_fn = bwd_conv_k;
    return t;
}

tensor* batched_conv(tensor* input, tensor* kernel, tensor* bias) {
    input->num_uses++;
    kernel->num_uses++;
    bias->num_uses++;
    tensor* t = batched_conv_k(input, kernel, bias);
    t->is_leaf = false;
    t->num_inputs = 3;
    t->inputs[0] = input;
    t->inputs[1] = kernel;
    t->inputs[2] = bias;
    t->op_type = 10;
    t->grad_fn = bwd_batched_conv_k;
    return t;
}

tensor* maxpool(tensor* input) {
    input->num_uses++;
    tensor* t = maxpool_k(input);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = input;
    t->op_type = 11;
    t->grad_fn = bwd_maxpool_k;
    return t;
}

tensor* batched_maxpool(tensor* input) {
    input->num_uses++;
    tensor* t = batched_maxpool_k(input);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = input;
    t->op_type = 12;
    t->grad_fn = bwd_batched_maxpool_k;
    return t;
}
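
// A small, optional debugging helper (a sketch, not used anywhere above): maps the op_type
// codes recorded by the ops in this file back to readable names. The codes are taken
// verbatim from the functions above.
static const char* op_type_name(int op_type) {
    switch (op_type) {
        case 0:  return "add";
        case 1:  return "sub";
        case 2:  return "mul";
        case 3:  return "matmul";
        case 4:  return "pow";
        case 5:  return "reduce_sum";
        case 6:  return "relu";
        case 7:  return "transpose";
        case 8:  return "batched_matmul";
        case 9:  return "conv";
        case 10: return "batched_conv";
        case 11: return "maxpool";
        case 12: return "batched_maxpool";
        case 13: return "batched_flatten";
        case 14: return "select";
        case 15: return "log";
        case 16: return "exp";
        case 17: return "batched_reduce_sum";
        case 18: return "repeat";
        case 19: return "neg";
        case 20: return "div";
        case 21: return "reduce_max";
        case 22: return "batched_reduce_max";
        default: return "unknown";
    }
}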