kernels.cpp
/*
implementations of these are exactly the same for CPU and CUDA -- reuse

Temporarily commented out ops which don't yet have a CUDA impl, so that compilation
doesn't fail because the linker can't find the _k kernels expected by these ops
*/


void add_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);

    // out is an output of the op, it's used to
    // retrieve pointers to the input tensors
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local grad (note: also allocates the buffer)
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, 1.0);

    // downstream = local * upstream

    // todo-now: do += instead of replacing the existing grad (see the sketch after mul_bwd below)
    a->grad = mul_k(a_local, upstream);
    b->grad = mul_k(b_local, upstream);
}

void sub_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, -1.0);

    // downstream = local * upstream
    tensor* curr_a_grad = mul_k(a_local, upstream);
    tensor* curr_b_grad = mul_k(b_local, upstream);

    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
    maybe_init_grad(b);
    add_k_(b->grad, curr_b_grad, b->grad);
}

void mul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = b;
    tensor* b_local = a;

    // downstream = local * upstream
    a->grad = mul_k(a_local, upstream);
    b->grad = mul_k(b_local, upstream);
}
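/*
Sketch (not compiled in): the "+=" accumulating variant flagged by the todo-now in add_bwd above.
add_bwd and mul_bwd currently overwrite any existing grad; sub_bwd and div_bwd already accumulate
via maybe_init_grad + add_k_, and the same pattern applies here. The name add_bwd_accum is
hypothetical.

void add_bwd_accum(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local grad of addition is 1 for both inputs
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, 1.0);

    // downstream = local * upstream
    tensor* curr_a_grad = mul_k(a_local, upstream);
    tensor* curr_b_grad = mul_k(b_local, upstream);

    // "+=" into any existing grad instead of replacing it
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
    maybe_init_grad(b);
    add_k_(b->grad, curr_b_grad, b->grad);
}
*/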
void matmul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // 1. local

    // Even though e.g. a transpose (and not a itself) will be stored in a->grad,
    // it does not make sense to allocate like below: the dims are only logical and the
    // memory is contiguous anyway, so both of the constructors below would allocate the same memory
    // EmptyFloat(a->shape[1], a->shape[0]); // reversed shapes to store Transpose
    // EmptyFloatLike(a);

    // note: a_buff should be allocated not with a.size but with b.size
    // note: using this intermediate variable instead of directly
    // allocating "a->grad = EmptyFloatLike(b)", because the final
    // a->grad is of shape a, not of shape b. It would be incorrect
    // to do "a->grad = EmptyFloatLike(b)". So I keep these temporary
    // variables (local_a, local_b) and de-allocate them after
    // computing the actual a->grad

    // upstream(N, D) // same as t.shape
    // a - ?
    //   a(N, M), so a_grad(N, M)
    //   upstream(N, D) @ b.t(D, M) = a_grad(N, M)
    // b - ?
    //   b(M, D), so b_grad(M, D)
    //   a.t(M, N) @ upstream(N, D) = b_grad(M, D)

    tensor* local_a = transpose_k(b); // (M, D) -> (D, M)
    tensor* local_b = transpose_k(a); // (N, M) -> (M, N)

    // 2. wire local with upstream
    // upstream(N, D) @ b.t(D, M) = a_grad(N, M)
    a->grad = matmul_k(upstream, local_a);
    // a.t(M, N) @ upstream(N, D) = b_grad(M, D)
    b->grad = matmul_k(local_b, upstream);
}

void div_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = div_k(TensorLikeFill(a, 1.0), b);
    tensor* b_local = neg_k(div_k(a, pow_k(b, 2)));

    // downstream

    // "a->grad = mul_k(a_local, upstream)" overwrites the input's grad; the below does "+=" to it instead
    maybe_init_grad(a);
    maybe_init_grad(b);

    tensor* a_grad = mul_k(a_local, upstream);
    tensor* b_grad = mul_k(b_local, upstream);
    // does "+="
    add_k_(a->grad, a_grad, a->grad);
    add_k_(b->grad, b_grad, b->grad);
}

void repeat_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];

    int axis = out->non_grad_inputs[0];
    if (axis == 0) {
        // sum together each column of upstream
        // option 1: add "batched_reduce_sum_k(..., axis=0)"
        // option 2: to avoid modifying batched_reduce_sum, use: transpose -> batched_reduce_sum -> transpose

        // todo: maybe a cleaner way is to add an "axis" argument to batched_reduce_sum,
        // but implement it in terms of transposes (no need to change the cuda kernel) --
        // a sketch of this is at the end of the file

        tensor* transposed = transpose_k(upstream);          // (num_repeats, N) -> (N, num_repeats)
        tensor* reduced = batched_reduce_sum_k(transposed);  // (N, 1)
        a->grad = transpose_k(reduced);                      // (1, N)

    } else if (axis == 1) {
        // sum together each row of upstream
        a->grad = batched_reduce_sum_k(upstream);
    }
}

// reduce_sum: (B, N) -> (B, 1)
void batched_reduce_sum_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0]; // (B, N)

    // important to fill with 0's if we're going to "+=" into it below.
    // If we instead simply overwrote it, it wouldn't matter, but
    // because we do "+=" it does matter (if there's any garbage
    // data, the grad will be added to it)
    maybe_init_grad(a);

    int N = a->shape[1];
    tensor* local = TensorLikeFill(a, 1.0); // (B, N)
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing a grad
    add_k_(a->grad, a_grad, a->grad);
}

void pow_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // 1. local
    // store local in the grad field
    // todo-low: mem leak
    // todo: below assumes exponent=2 ?
    tensor* local = mul_k(TensorLikeFill(a, 2.0), a);
    // 2. wire local with upstream
    a->grad = mul_k(local, upstream);
}
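/*
Sketch (not compiled in): a general power-rule local for pow_bwd above, d/da a^n = n * a^(n-1),
instead of the hard-coded exponent 2. This assumes the exponent is recorded during the fwd pass
somewhere retrievable -- e.g. in non_grad_inputs, the way repeat_bwd reads its axis; that storage
and the name pow_bwd_general are assumptions, not something the current pow op does.

void pow_bwd_general(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    int n = out->non_grad_inputs[0]; // hypothetical: exponent saved by the fwd op

    // local = n * a^(n-1)
    tensor* local = mul_k(TensorLikeFill(a, (float)n), pow_k(a, n - 1));

    // wire local with upstream (overwrites, matching pow_bwd above)
    a->grad = mul_k(local, upstream);
}
*/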
void exp_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* local = out;
    tensor* curr_a_grad = mul_k(local, upstream);
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
}

void log_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    a->grad = TensorLikeFill(a, 0.0);

    // e is the base of the natural logarithm (approximately 2.718282);
    // some implementations of the <math.h> library include the constant M_E
    float log_e = logf(M_E);
    // non-vectorized form: "(1/a) * log_e"
    tensor* local = mul_k(div_k(TensorLikeFill(a, 1), a), TensorLikeFill(a, log_e));
    mul_k_(local, upstream, a->grad);
}

void reduce_sum_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // 1. local
    tensor* local = TensorLikeFill(a, 1.0);
    // 2. wire local with upstream
    // make upstream and local the same shape (currently upstream is a scalar, while local is a 2d tensor)

    // Note: cannot just use "upstream->data[0]", because the data can be on CUDA;
    // also there's no need to call "COPY_TO_DEVICE", as it will be called from TensorLikeFill
    tensor* upstream_host = COPY_FROM_DEVICE(upstream);

    tensor* broadcasted_upstream = TensorLikeFill(a, upstream_host->data[0]);
    a->grad = mul_k(local, broadcasted_upstream);
}

// todo-high: it doesn't make sense to have a transpose_bwd because it just calls transpose -- at the moment
// this fn exists because the calling convention (args signature) for _bwd funcs is different from the fwd funcs
// todo: bwd formula
void transpose_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    a->grad = transpose_k(upstream);
}

void neg_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* local = TensorLikeFill(a, -1.0);
    a->grad = mul_k(local, upstream);
}

// comment: shapes are the same as in matmul_bwd, but with an additional (B,) dim first
void batched_matmul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // upstream(B, N, D) // same as t.shape
    // a - ?
    //   a(B, N, M), so a_grad(B, N, M)
    //   upstream(B, N, D) @ b.t(B, D, M) = a_grad(B, N, M)
    // b - ?
    //   b(B, M, D), so b_grad(B, M, D)
    //   a.t(B, M, N) @ upstream(B, N, D) = b_grad(B, M, D)

    tensor* local_a = batched_transpose_k(b); // (B, M, D) -> (B, D, M)
    tensor* local_b = batched_transpose_k(a); // (B, N, M) -> (B, M, N)

    // 2. wire local with upstream
    // upstream(B, N, D) @ b.t(B, D, M) = a_grad(B, N, M)
    a->grad = batched_matmul_k(upstream, local_a);
    // a.t(B, M, N) @ upstream(B, N, D) = b_grad(B, M, D)
    b->grad = batched_matmul_k(local_b, upstream);
}

/*
answer-now:
these are almost identical, and it makes sense: select_bwd and batched_reduce_max_bwd --
for reduce_max_bwd you just need one more step to get the idxs, while for select_bwd
they are already provided as arguments

void select_bwd(tensor* upstream, tensor* out) {
    idxs = out->inputs[1];
    a->grad = TensorLikeFill(a, 0.0);
    select_set(a->grad, idxs, 1);
    mul_k(upstream, a->grad);
}

void batched_reduce_max_bwd(tensor* upstream, tensor* out) {
    idxs = // get_max_idx ...
    a->grad = TensorLikeFill(a, 0.0);
    select_set(a->grad, idxs, 1);
    mul_k(upstream, a->grad);
}
*/

// a(B, N), idx(B, 1) = out(B, 1)
void select_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* idx = out->inputs[1];
    int N = a->shape[1];

    // local
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing grad
    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);

    // todo: rm the grad wrt idx below? If so, autograd needs to check "if inp->grad != NULL" before lprint'ing
    // grad wrt idx
    maybe_init_grad(idx);
    local = TensorLikeFill(idx, 1.0);
    tensor* idx_grad = mul_k(local, upstream);
    add_k_(idx->grad, idx_grad, idx->grad);
}

// made reduce_max_bwd shared between gpu and cpu by allowing kernels to record
// some additional info during the fwd kernel, and simply access and use this field in bwd

// todo-high: too many copy_to/from
void reduce_max_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // todo-now: use select_set? This would also avoid needing to call COPY_TO_DEVICE / COPY_FROM_DEVICE here (when setting local->data[idx]=1)
    int idx = (int)out->scratch_space[0]->data[0];
    // 1. local
    tensor* local = TensorLikeFill(a, 0.0);
    tensor* local_host = COPY_FROM_DEVICE(local);
    local_host->data[idx] = 1.0;
    COPY_TO_DEVICE(local_host); // semantically this is "local"

    // 2. wire local with upstream
    // copy to cpu before accessing t->data
    tensor* upstream_host = COPY_FROM_DEVICE(upstream);
    // make upstream and local the same shape (currently upstream is a scalar, while local is a 2d tensor)
    tensor* broadcasted_upstream = TensorLikeFill(a, upstream_host->data[0]);

    tensor* curr_a_grad = mul_k(local_host, broadcasted_upstream);
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
}
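/*
Sketch (not compiled in): reduce_sum_bwd and reduce_max_bwd above both broadcast a scalar upstream
the same way (COPY_FROM_DEVICE, then TensorLikeFill with data[0]). A small helper like the one
below could factor that out; the name broadcast_scalar_like is hypothetical.

tensor* broadcast_scalar_like(tensor* scalar, tensor* like) {
    // copy to host before touching ->data, since the data can live on CUDA
    tensor* scalar_host = COPY_FROM_DEVICE(scalar);
    // no COPY_TO_DEVICE needed -- TensorLikeFill places the result on the device itself
    return TensorLikeFill(like, scalar_host->data[0]);
}

// usage in reduce_sum_bwd:  a->grad = mul_k(local, broadcast_scalar_like(upstream, a));
*/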
// batched reduce_max: a(B, N) -> out(B, 1); the max idxs are recorded in scratch as idx(B, 1),
// so the shapes match select: a(B, N), idx(B, 1) = out(B, 1)
void batched_reduce_max_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];

    int N = a->shape[1];
    tensor* idx = out->scratch_space[0]; // (B, 1)

    // local
    // need to set ones at these idxs here, because select_k returns a new tensor with the
    // indexed elements copied from the original -- it's not a view on the original elements,
    // so we can't set them by modifying the output of select_k
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing grad
    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);
}
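/*
Sketch (not compiled in), following up on the answer-now note above: select_bwd and
batched_reduce_max_bwd differ only in where the (B, 1) idx tensor comes from (inputs[1] vs
scratch_space[0]) and in whether a grad wrt idx is needed. The shared body could be factored
out like this; the name scatter_upstream_at_idxs_ is hypothetical.

void scatter_upstream_at_idxs_(tensor* upstream, tensor* a, tensor* idx) {
    int N = a->shape[1];

    // local: zeros with ones at the selected / max positions
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream = local * broadcasted upstream, accumulated into a->grad
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);
}

// select_bwd would call:             scatter_upstream_at_idxs_(upstream, a, out->inputs[1]);
// batched_reduce_max_bwd would call: scatter_upstream_at_idxs_(upstream, a, out->scratch_space[0]);
*/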
void batched_flatten_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    maybe_init_grad(a);
    // reshape upstream into the shape of a
    unsafe_add_k_(a->grad, upstream, a->grad);
}


// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fwd kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


tensor* batched_flatten_k(tensor* a) {
    // ugly, but uses assert_input for contiguity and device check
    assert_input(a, -1);
    if (a->num_dims != 3 && a->num_dims != 4) {
        printf("[batched_flatten] Shape error\n");
        exit(1);
    }
    int B = a->shape[0];

    // collapsing all dims of the tensor except B (0-th dim);
    // this line is more concise than iterating over a.shape[1:] and multiplying them
    int out_dim = a->size / a->shape[0];

    // inputs to this kernel can be 3d or 4d -- but the output is always 2d (all dims flattened except for the batch dim)
    // todo: these copies aren't needed -- could just change strides on the upstream, but
    // then the rest of the kernels would need to support strided input (the output of this kernel)
    // todo: memory leak
    tensor* out = TensorLikeFill(Tensor(B, out_dim), 0.);

    // use add_k_ on the zero-initialized out, to avoid needing to
    // implement a for-loop copy (_copy_arr) as a separate primitive in both backends;
    //
    // I ended up special-casing this fn anyway (using "unsafe_add_k_" instead of "add_k_"),
    // because the former only needs to assert the same input->size, while the latter asserts
    // the same input->shape (more restrictive) -- but in general (for all other cuda kernels
    // using _launch_binary_elementwise) the more restrictive checks are helpful
    unsafe_add_k_(out, a, out);
    return out;
}
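/*
Sketch (not compiled in): the axis-aware batched_reduce_sum mentioned in the todo inside
repeat_bwd -- implemented purely in terms of transpose_k and the existing batched_reduce_sum_k,
so the cuda kernel itself doesn't need to change. The name batched_reduce_sum_axis_k is
hypothetical.

tensor* batched_reduce_sum_axis_k(tensor* a, int axis) {
    if (axis == 1) {
        // sum each row: (B, N) -> (B, 1)
        return batched_reduce_sum_k(a);
    }
    // axis == 0: sum each column via transpose -> reduce -> transpose
    tensor* transposed = transpose_k(a);                 // (B, N) -> (N, B)
    tensor* reduced = batched_reduce_sum_k(transposed);  // (N, 1)
    return transpose_k(reduced);                         // (1, N)
}
*/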