kernels.cpp
/*
implementations of these are exactly the same for CPU and CUDA -- reuse

Temporarily commented out ops which don't yet have a CUDA impl, so that compilation
doesn't fail because the linker can't find the _k kernels expected by these ops
*/


void add_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);

    // out is an output of the op, it's used to
    // retrieve pointers to the input tensors
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local grad (note: also allocates the buffer)
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, 1.0);

    // downstream = local * upstream

    // todo-now: do += instead of replacing the existing grad (see the sketch after mul_bwd below)
    a->grad = mul_k(a_local, upstream);
    b->grad = mul_k(b_local, upstream);
}

void sub_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, -1.0);

    // downstream = local * upstream
    tensor* curr_a_grad = mul_k(a_local, upstream);
    tensor* curr_b_grad = mul_k(b_local, upstream);

    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
    maybe_init_grad(b);
    add_k_(b->grad, curr_b_grad, b->grad);
}

void mul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = b;
    tensor* b_local = a;

    // downstream = local * upstream
    a->grad = mul_k(a_local, upstream);
    b->grad = mul_k(b_local, upstream);
}
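/*
Sketch (not compiled in): the "+=" accumulating variant flagged by the todo-now in add_bwd above.
add_bwd and mul_bwd currently overwrite any existing grad; sub_bwd and div_bwd already accumulate
via maybe_init_grad + add_k_, and the same pattern applies here. The name add_bwd_accum is
hypothetical.

void add_bwd_accum(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local grad of addition is 1 for both inputs
    tensor* a_local = TensorLikeFill(a, 1.0);
    tensor* b_local = TensorLikeFill(b, 1.0);

    // downstream = local * upstream
    tensor* curr_a_grad = mul_k(a_local, upstream);
    tensor* curr_b_grad = mul_k(b_local, upstream);

    // "+=" into any existing grad instead of replacing it
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
    maybe_init_grad(b);
    add_k_(b->grad, curr_b_grad, b->grad);
}
*/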
void matmul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // 1. local

    // Even though e.g. a transpose (and not a itself) will be stored in a->grad,
    // it does not make sense to allocate like below: the dims are only logical and the
    // memory is contiguous anyway, so both of the constructors below would allocate the same memory
    // EmptyFloat(a->shape[1], a->shape[0]); // reversed shapes to store Transpose
    // EmptyFloatLike(a);

    // note: a_buff should be allocated not with a.size but with b.size
    // note: using this intermediate variable instead of directly
    // allocating "a->grad = EmptyFloatLike(b)", because the final
    // a->grad is of shape a, not of shape b. It would be incorrect
    // to do "a->grad = EmptyFloatLike(b)". So I keep these temporary
    // variables (local_a, local_b) and de-allocate them after
    // computing the actual a->grad

    // upstream(N, D) // same as t.shape
    // a - ?
    //   a(N, M), so a_grad(N, M)
    //   upstream(N, D) @ b.t(D, M) = a_grad(N, M)
    // b - ?
    //   b(M, D), so b_grad(M, D)
    //   a.t(M, N) @ upstream(N, D) = b_grad(M, D)

    tensor* local_a = transpose_k(b); // (M, D) -> (D, M)
    tensor* local_b = transpose_k(a); // (N, M) -> (M, N)

    // 2. wire local with upstream
    // upstream(N, D) @ b.t(D, M) = a_grad(N, M)
    a->grad = matmul_k(upstream, local_a);
    // a.t(M, N) @ upstream(N, D) = b_grad(M, D)
    b->grad = matmul_k(local_b, upstream);
}

void div_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // local
    tensor* a_local = div_k(TensorLikeFill(a, 1.0), b);
    tensor* b_local = neg_k(div_k(a, pow_k(b, 2)));

    // downstream

    // "a->grad = mul_k(a_local, upstream)" overwrites the input's grad; the below does "+=" to it instead
    maybe_init_grad(a);
    maybe_init_grad(b);

    tensor* a_grad = mul_k(a_local, upstream);
    tensor* b_grad = mul_k(b_local, upstream);
    // does "+="
    add_k_(a->grad, a_grad, a->grad);
    add_k_(b->grad, b_grad, b->grad);
}

void repeat_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];

    int axis = out->non_grad_inputs[0];
    if (axis == 0) {
        // sum together each column of upstream
        // option 1: add "batched_reduce_sum_k(..., axis=0)"
        // option 2: to avoid modifying batched_reduce_sum, use: transpose -> batched_reduce_sum -> transpose

        // todo: maybe a cleaner way is to add an "axis" argument to batched_reduce_sum,
        // but implement it in terms of transposes (no need to change the cuda kernel) --
        // a sketch of this is at the end of the file

        tensor* transposed = transpose_k(upstream);          // (num_repeats, N) -> (N, num_repeats)
        tensor* reduced = batched_reduce_sum_k(transposed);  // (N, 1)
        a->grad = transpose_k(reduced);                      // (1, N)

    } else if (axis == 1) {
        // sum together each row of upstream
        a->grad = batched_reduce_sum_k(upstream);
    }
}

// reduce_sum: (B, N) -> (B, 1)
void batched_reduce_sum_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0]; // (B, N)

    // important to fill with 0's if we're going to "+=" into it below.
    // If we instead simply overwrote it, it wouldn't matter, but
    // because we do "+=" it does matter (if there's any garbage
    // data, the grad will be added to it)
    maybe_init_grad(a);

    int N = a->shape[1];
    tensor* local = TensorLikeFill(a, 1.0); // (B, N)
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing a grad
    add_k_(a->grad, a_grad, a->grad);
}

void pow_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // 1. local
    // store local in the grad field
    // todo-low: mem leak
    // todo: below assumes exponent=2 ?
    tensor* local = mul_k(TensorLikeFill(a, 2.0), a);
    // 2. wire local with upstream
    a->grad = mul_k(local, upstream);
}
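/*
Sketch (not compiled in): a general power-rule local for pow_bwd above, d/da a^n = n * a^(n-1),
instead of the hard-coded exponent 2. This assumes the exponent is recorded during the fwd pass
somewhere retrievable -- e.g. in non_grad_inputs, the way repeat_bwd reads its axis; that storage
and the name pow_bwd_general are assumptions, not something the current pow op does.

void pow_bwd_general(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    int n = out->non_grad_inputs[0]; // hypothetical: exponent saved by the fwd op

    // local = n * a^(n-1)
    tensor* local = mul_k(TensorLikeFill(a, (float)n), pow_k(a, n - 1));

    // wire local with upstream (overwrites, matching pow_bwd above)
    a->grad = mul_k(local, upstream);
}
*/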
void exp_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* local = out;
    tensor* curr_a_grad = mul_k(local, upstream);
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
}

void log_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    a->grad = TensorLikeFill(a, 0.0);

    // e is the base of the natural logarithm (approximately 2.718282);
    // some implementations of the <math.h> library include the constant M_E
    float log_e = logf(M_E);
    // non-vectorized form: "(1/a) * log_e"
    tensor* local = mul_k(div_k(TensorLikeFill(a, 1), a), TensorLikeFill(a, log_e));
    mul_k_(local, upstream, a->grad);
}

void reduce_sum_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // 1. local
    tensor* local = TensorLikeFill(a, 1.0);
    // 2. wire local with upstream
    // make upstream and local the same shape (currently upstream is a scalar, while local is a 2d tensor)

    // Note: cannot just use "upstream->data[0]", because the data can be on CUDA;
    // also there's no need to call "COPY_TO_DEVICE", as it will be called from TensorLikeFill
    tensor* upstream_host = COPY_FROM_DEVICE(upstream);

    tensor* broadcasted_upstream = TensorLikeFill(a, upstream_host->data[0]);
    a->grad = mul_k(local, broadcasted_upstream);
}

// todo-high: it doesn't make sense to have a transpose_bwd because it just calls transpose -- at the moment
// this fn exists because the calling convention (args signature) for _bwd funcs is different from the fwd funcs
// todo: bwd formula
void transpose_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    a->grad = transpose_k(upstream);
}

void neg_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* local = TensorLikeFill(a, -1.0);
    a->grad = mul_k(local, upstream);
}

// comment: shapes are the same as in matmul_bwd, but with an additional (B,) dim first
void batched_matmul_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* b = out->inputs[1];

    // upstream(B, N, D) // same as t.shape
    // a - ?
    //   a(B, N, M), so a_grad(B, N, M)
    //   upstream(B, N, D) @ b.t(B, D, M) = a_grad(B, N, M)
    // b - ?
    //   b(B, M, D), so b_grad(B, M, D)
    //   a.t(B, M, N) @ upstream(B, N, D) = b_grad(B, M, D)

    tensor* local_a = batched_transpose_k(b); // (B, M, D) -> (B, D, M)
    tensor* local_b = batched_transpose_k(a); // (B, N, M) -> (B, M, N)

    // 2. wire local with upstream
    // upstream(B, N, D) @ b.t(B, D, M) = a_grad(B, N, M)
    a->grad = batched_matmul_k(upstream, local_a);
    // a.t(B, M, N) @ upstream(B, N, D) = b_grad(B, M, D)
    b->grad = batched_matmul_k(local_b, upstream);
}

/*
answer-now:
these are almost identical, and it makes sense: select_bwd and batched_reduce_max_bwd --
for reduce_max_bwd you just need one more step to get the idxs, while for select_bwd
they are already provided as arguments

void select_bwd(tensor* upstream, tensor* out) {
    idxs = out->inputs[1];
    a->grad = TensorLikeFill(a, 0.0);
    select_set(a->grad, idxs, 1);
    mul_k(upstream, a->grad);
}

void batched_reduce_max_bwd(tensor* upstream, tensor* out) {
    idxs = // get_max_idx ...
    a->grad = TensorLikeFill(a, 0.0);
    select_set(a->grad, idxs, 1);
    mul_k(upstream, a->grad);
}
*/

// a(B, N), idx(B, 1) = out(B, 1)
void select_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    tensor* idx = out->inputs[1];
    int N = a->shape[1];

    // local
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing grad
    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);

    // todo: rm the grad wrt idx below? If so, autograd needs to check "if inp->grad != NULL" before lprint'ing
    // grad wrt idx
    maybe_init_grad(idx);
    local = TensorLikeFill(idx, 1.0);
    tensor* idx_grad = mul_k(local, upstream);
    add_k_(idx->grad, idx_grad, idx->grad);
}

// made reduce_max_bwd shared between gpu and cpu by allowing kernels to record
// some additional info during the fwd kernel, and simply access and use this field in bwd

// todo-high: too many copy_to/from
void reduce_max_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // todo-now: use select_set? This would also avoid needing to call COPY_TO_DEVICE / COPY_FROM_DEVICE here (when setting local->data[idx]=1)
    int idx = (int)out->scratch_space[0]->data[0];
    // 1. local
    tensor* local = TensorLikeFill(a, 0.0);
    tensor* local_host = COPY_FROM_DEVICE(local);
    local_host->data[idx] = 1.0;
    COPY_TO_DEVICE(local_host); // semantically this is "local"

    // 2. wire local with upstream
    // copy to cpu before accessing t->data
    tensor* upstream_host = COPY_FROM_DEVICE(upstream);
    // make upstream and local the same shape (currently upstream is a scalar, while local is a 2d tensor)
    tensor* broadcasted_upstream = TensorLikeFill(a, upstream_host->data[0]);

    tensor* curr_a_grad = mul_k(local_host, broadcasted_upstream);
    maybe_init_grad(a);
    add_k_(a->grad, curr_a_grad, a->grad);
}
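/*
Sketch (not compiled in): reduce_sum_bwd and reduce_max_bwd above both broadcast a scalar upstream
the same way (COPY_FROM_DEVICE, then TensorLikeFill with data[0]). A small helper like the one
below could factor that out; the name broadcast_scalar_like is hypothetical.

tensor* broadcast_scalar_like(tensor* scalar, tensor* like) {
    // copy to host before touching ->data, since the data can live on CUDA
    tensor* scalar_host = COPY_FROM_DEVICE(scalar);
    // no COPY_TO_DEVICE needed -- TensorLikeFill places the result on the device itself
    return TensorLikeFill(like, scalar_host->data[0]);
}

// usage in reduce_sum_bwd:  a->grad = mul_k(local, broadcast_scalar_like(upstream, a));
*/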
// batched reduce_max: a(B, N) -> out(B, 1); the max idxs are recorded in scratch as idx(B, 1),
// so the shapes match select: a(B, N), idx(B, 1) = out(B, 1)
void batched_reduce_max_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];

    int N = a->shape[1];
    tensor* idx = out->scratch_space[0]; // (B, 1)

    // local
    // need to set ones at these idxs here, because select_k returns a new tensor with the
    // indexed elements copied from the original -- it's not a view on the original elements,
    // so we can't set them by modifying the output of select_k
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    // add to existing grad
    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);
}
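/*
Sketch (not compiled in), following up on the answer-now note above: select_bwd and
batched_reduce_max_bwd differ only in where the (B, 1) idx tensor comes from (inputs[1] vs
scratch_space[0]) and in whether a grad wrt idx is needed. The shared body could be factored
out like this; the name scatter_upstream_at_idxs_ is hypothetical.

void scatter_upstream_at_idxs_(tensor* upstream, tensor* a, tensor* idx) {
    int N = a->shape[1];

    // local: zeros with ones at the selected / max positions
    tensor* local = TensorLikeFill(a, 0.0); // (B, N)
    select_set_(local, idx, 1.);

    // downstream = local * broadcasted upstream, accumulated into a->grad
    tensor* upstream_broadcasted = repeat_k(upstream, /*axis = */ 1, /*num_repeats = */ N); // (B, 1) -> (B, N)
    tensor* a_grad = mul_k(local, upstream_broadcasted);

    maybe_init_grad(a);
    add_k_(a->grad, a_grad, a->grad);
}

// select_bwd would call:             scatter_upstream_at_idxs_(upstream, a, out->inputs[1]);
// batched_reduce_max_bwd would call: scatter_upstream_at_idxs_(upstream, a, out->scratch_space[0]);
*/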
void batched_flatten_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    maybe_init_grad(a);
    // reshape upstream into the shape of a
    unsafe_add_k_(a->grad, upstream, a->grad);
}


// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fwd kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


tensor* batched_flatten_k(tensor* a) {
    // ugly, but uses assert_input for contiguity and device check
    assert_input(a, -1);
    if (a->num_dims != 3 && a->num_dims != 4) {
        printf("[batched_flatten] Shape error\n");
        exit(1);
    }
    int B = a->shape[0];

    // collapsing all dims of the tensor except B (0-th dim);
    // this line is more concise than iterating over a.shape[1:] and multiplying them
    int out_dim = a->size / a->shape[0];

    // inputs to this kernel can be 3d or 4d -- but the output is always 2d (all dims flattened except for the batch dim)
    // todo: these copies aren't needed -- could just change strides on the upstream, but
    // then the rest of the kernels would need to support strided input (the output of this kernel)
    // todo: memory leak
    tensor* out = TensorLikeFill(Tensor(B, out_dim), 0.);

    // use add_k_ on the zero-initialized out, to avoid needing to
    // implement a for-loop copy (_copy_arr) as a separate primitive in both backends;
    //
    // I ended up special-casing this fn anyway (using "unsafe_add_k_" instead of "add_k_"),
    // because the former only needs to assert the same input->size, while the latter asserts
    // the same input->shape (more restrictive) -- but in general (for all other cuda kernels
    // using _launch_binary_elementwise) the more restrictive checks are helpful
    unsafe_add_k_(out, a, out);
    return out;
}
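/*
Sketch (not compiled in): the axis-aware batched_reduce_sum mentioned in the todo inside
repeat_bwd -- implemented purely in terms of transpose_k and the existing batched_reduce_sum_k,
so the cuda kernel itself doesn't need to change. The name batched_reduce_sum_axis_k is
hypothetical.

tensor* batched_reduce_sum_axis_k(tensor* a, int axis) {
    if (axis == 1) {
        // sum each row: (B, N) -> (B, 1)
        return batched_reduce_sum_k(a);
    }
    // axis == 0: sum each column via transpose -> reduce -> transpose
    tensor* transposed = transpose_k(a);                 // (B, N) -> (N, B)
    tensor* reduced = batched_reduce_sum_k(transposed);  // (N, 1)
    return transpose_k(reduced);                         // (1, N)
}
*/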