// backends/cpu/kernels.cpp
#include <math.h>   // log, pow
#include <stdio.h>  // printf
#include <stdlib.h> // exit


tensor* neg_k(tensor*);
tensor* pow_k(tensor*, int);
tensor* transpose_k(tensor*);
tensor* mul_k(tensor*, tensor*);
tensor* batched_transpose_k(tensor*);
tensor* batched_reduce_sum_k(tensor*);


// even though this fn is called assert_device, it checks for cpu -- this naming keeps parity between the two backends (to reuse common/asserts)
void assert_device(tensor* a){
    if (a->device!=CPU){
        printf("[assert_device] Error: expected device cpu. Saw: %i\n", a->device);
        exit(1);
    }
}


// todo:
// - to preserve old behavior, change ops to:
//     - index from the right (inner-most dims), NOT from the left (outer-most)
//     - loop over the outer-most dim as well
// - IOW, in nn.h when the num of dims grows from 3 to 4, all the existing ops (indexing from the left) might break?
// - On the other hand, the kernels are specialized to a particular num of dims -- so they will break anyway

// note: naming convention: assume "_elementwise" by default -- and specify otherwise (if not elementwise) -- e.g. reduce_, matrix_, etc
// todo: ? use capital letters for functions that allocate heap mem, use lowercase otherwise
// note: the derivatives below try to follow this order -- local * upstream
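//   (i.e. the chain rule dL/da = (dout/da) * (dL/dout), reading "local" as dout/da and "upstream" as dL/dout)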

// **** kernels *****
//   - operate on tensors
//   - allocate output buff
//      - return new tensor


//    binary elementwise



tensor* add_k_(tensor* a, tensor* b, tensor* out) {
    assert_binary_elementwise_non_contiguous(a, b);

    // Previously "at" was hardcoded here for a specific num of dims -- conv_bwd uses add (which calls add_k_) for both 3d tensors (grads wrt input) AND 4d tensors (grads wrt kernels)
    // todo: here and in all ops, assert same size
    for (int i=0; i<out->size; i++){
        // note: incrementing the index like this in a loop assumes contiguous data:
        // out->data[i] = a->data[i] + b->data[i];
        out->data[at(out, i)] = a->data[at(a, i)] + b->data[at(b, i)];
    }
    return out;
}

tensor* unsafe_add_k_(tensor* a, tensor* b, tensor* out) {
    assert_device(a);
    assert_device(b);
    assert_same_size(a, b);
    for (int i=0; i<out->size; i++){
        // note: removed at, because at only supports 2d and 3d, but batched_flatten_k passes a 4d tensor here
        // out->data[at(out, i)] = a->data[at(a, i)] + b->data[at(b, i)];
        out->data[i] = a->data[i] + b->data[i];
    }
    return out;
}


// todo: use "const tensor*" instead of "tensor*"
tensor* add_k(tensor* a, tensor* b) {
    tensor* out = TensorLike(a);
    return add_k_(a, b, out);
}
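
// Usage sketch (illustrative): the trailing-underscore convention here is that
// add_k_ writes into a caller-provided out tensor, while add_k allocates its own:
//   tensor* c = add_k(a, b);   // allocates c; c->data[i] == a->data[i] + b->data[i]
//   add_k_(a, b, c);           // reuses an existing output buffer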

tensor* sub_k_(tensor* a, tensor* b, tensor* out) {
    assert_binary_elementwise(a, b);
    for (int i=0; i<out->size; i++){
        out->data[i] = a->data[i] - b->data[i];
    }
    return out;
}

tensor* sub_k(tensor* a, tensor* b) {
    tensor* out = TensorLike(a);
    return sub_k_(a, b, out);
}

tensor* mul_k_(tensor* a, tensor* b, tensor* out) {
    assert_binary_elementwise(a, b);
    for (int i=0; i<out->size; i++){
        out->data[i] = a->data[i] * b->data[i];
    }
    return out;
}

tensor* mul_k(tensor* a, tensor* b) {
    tensor* out = TensorLike(a);
    return mul_k_(a, b, out);
}


//    binary


// a.shape (N, M)
// b.shape (M, D)
// out.shape (N, D)
void matmul_k_(tensor* a, tensor* b, tensor* out) {
    // supports non-contiguous
    assert_device(a);
    assert_device(b);
    assert_dim(a, 2);
    assert_dim(b, 2);
    if (a->shape[1] != b->shape[0]){
        printf("[matmul_k_] Error: inner dim doesn't match, saw: a(%i, %i) b(%i, %i)\n", a->shape[0], a->shape[1], b->shape[0], b->shape[1]);
        exit(1);
    }

    int N = a->shape[0], M = a->shape[1];
    int D = b->shape[1];

    // (N, M) @ (M, D) = (N, D)
    for (int n=0; n<N; n++){
        for (int d=0; d<D; d++){
            float sum = 0;
            for (int m=0; m<M; m++){
                // n*M, because for each "n" you skip "M" values

                // uses index_2d instead of "a->data[n * M + m * 1]" so as not to assume contiguous strides
                // printf("index_2d(a, n, m): %i\n", index_2d(a, n, m));
                sum += a->data[index_2d(a, n, m)] * b->data[index_2d(b, m, d)];
            }
            out->data[index_2d(out, n, d)] = sum;
        }
    }
}

// a.shape (N, M)
// b.shape (M, D)
// out.shape (N, D)
tensor* matmul_k(tensor* a, tensor* b) {
    int N = a->shape[0], D = b->shape[1];
    tensor* out = Tensor(N, D);
    matmul_k_(a, b, out);
    return out;
}
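
// Worked example (illustrative): (2, 3) @ (3, 2) -> (2, 2)
//   a = [[1 2 3],        b = [[1 0],
//        [4 5 6]]             [0 1],
//                             [1 1]]
//   out = [[1*1+2*0+3*1, 1*0+2*1+3*1],   = [[ 4,  5],
//          [4*1+5*0+6*1, 4*0+5*1+6*1]]      [10, 11]]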


tensor* div_k(tensor* a, tensor* b) {
    assert_binary_elementwise(a, b);
    tensor* out = TensorLike(a);
    for (int i=0; i<out->size; i++){
        out->data[i] = a->data[i] / b->data[i];
    }
    return out;
}


tensor* repeat_k(tensor* a, int axis, int num_repeats) {
    assert_input(a, 2);
    if (axis != 0 && axis != 1){
        printf("[repeat] Unexpected axis\n");
        exit(1);
    }
    if (a->shape[axis] != 1){
        printf("[repeat] Shape error\n");
        exit(1);
    }

    tensor* out;

    // a.shape (1, N)
    if (axis==0){
        int N = a->shape[1];
        out = Tensor(num_repeats, N);

        for (int b=0; b<num_repeats; b++){
            // points to the first element of the current b
            float* curr_out = out->data + (b * out->stride[0]);
            for (int i=0; i<N; i++){
                // for a, multiplying by "a->stride[0]" is not needed bc a is (1, N)
                *(curr_out+i) = *(a->data+i);
            }
        }

    // a.shape (B, 1)
    } else if (axis==1){
        int B = a->shape[0];
        out = Tensor(B, num_repeats);

        // 1 1 1
        // 2 2 2
        // 3 ...
        for (int b=0; b<B; b++){
            // points to the first element of the current b
            float* curr_a = a->data + b;
            // here indexing includes multiplying by "out->stride[0]" bc
            // out is a 2d tensor (for curr_a multiplying by "a->stride[0]" is
            // not needed bc curr_a points at a single element of the (B, 1) input)
            float* curr_out = out->data + (b * out->stride[0]);
            for (int i=0; i<num_repeats; i++){
                *(curr_out+i) = *(curr_a);
            }
        }
    }
    return out;
}
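
// Example (illustrative):
//   axis=0, num_repeats=2:  (1, 3) [[1 2 3]]       -> (2, 3) [[1 2 3], [1 2 3]]
//   axis=1, num_repeats=3:  (3, 1) [[1], [2], [3]] -> (3, 3) [[1 1 1], [2 2 2], [3 3 3]]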


tensor* select_k(tensor* a, tensor* idx) {
    // expect shapes:
    //  a(s1, s2)
    //  idx(s1, 1)
    //      also each element of idx should be in the range 0..s2-1
    //      (bc the elements of idx will be used to index the 2nd dim of a)
    //      this latter condition is not being checked here
    assert_input(a, 2);
    assert_input(idx, 2);
    if (idx->shape[1]!=1 || idx->shape[0]!=a->shape[0]) {
        printf("[select] Error: shape\n");
        exit(1);
    }

    int B = a->shape[0];
    tensor* out = Tensor(B, 1);

    for (int b=0; b<B; b++){
        float* curr_a = a->data + (a->stride[0]*b);
        // note: additional constraint on input: idx->data should be ints;
        // below:
        //   1) take a pointer to the *first* element of the current batch example in a (curr_a)
        //   2) add to it the idx for the current batch example (currently all tensors are floats, so need to cast idx->data to int before doing it)
        //   3) the result of the above is a pointer, so de-reference it
        out->data[b] = *(curr_a + (int)idx->data[b]);
        // printf("curr_idx: %i", (int)idx->data[b]);
    }
    return out;
}
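
// Example (illustrative): select_k acts like a per-row gather, e.g. picking
// the probability of the correct class from a (B, num_classes) tensor:
//   a = [[0.1 0.7 0.2],    idx = [[1],    select_k(a, idx) = [[0.7],
//        [0.5 0.2 0.3]]           [0]]                        [0.5]]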


void select_set_(tensor* a, tensor* idx, int value) {
    assert_input(a, 2);
    assert_input(idx, 2);
    if (idx->shape[1]!=1 || idx->shape[0]!=a->shape[0]) {
        printf("[select_set_] Error: shape\n");
        exit(1);
    }
    int B = a->shape[0];
    for (int b=0; b<B; b++){
        float* curr_a = a->data + (a->stride[0]*b);
        *(curr_a + (int)idx->data[b]) = value;
    }
}


//    unary

tensor* sqrt_k(tensor* a) {
    assert_input(a, -1);
    tensor* out = TensorLike(a);
    for (int i=0; i<out->size; i++){
        out->data[i] = (float)sqrt(a->data[i]);
    }
    return out;
}

tensor* pow_k(tensor* a, int exponent) {
    // supports n-dim input but must be contiguous
    // (because of the direct access to its data below)
    assert_input(a, -1);

    tensor* out = TensorLikeFill(a, 0.0);
    for (int i=0; i<out->size; i++){
        // pow here refers to C's math function, not tiny-torch's op
        out->data[i] = (float)pow(a->data[i], exponent);
    }
    return out;
}


tensor* reduce_sum_k(tensor* a) {
    assert_input(a, -1);

    // reduce to scalar
    tensor* out = TensorScalarFill(0.0);
    for (int i=0; i<a->size; i++){
        out->data[0] += a->data[i];
    }
    return out;
}



tensor* relu_k(tensor* a) {
    assert_input(a, -1);

    tensor* out = TensorLike(a);
    // todo: recording idxs into scratch space is more complicated for this kernel,
    // bc the size of that tensor isn't known in advance (data-dependent size)
    // tensor* scratch_space = TensorLikeFill(out, 0.);
    for (int i=0; i<out->size; i++){
        if (a->data[i] < 0.0){
            out->data[i] = 0.0;
            // scratch_space->data[]
        } else {
            out->data[i] = a->data[i];
        }
    }
    // out->scratch_space[0] = scratch_space;
    return out;
}

void relu_bwd(tensor* upstream, tensor* out) {
    assert_input(upstream, out->num_dims);
    tensor* a = out->inputs[0];
    // local
    // todo: avoid re-computing
    //  this is kind of gradient checkpointing;
    //  ofc can avoid this by making the original relu output the mask
    //  -- but I found recomputing is a bit cleaner to explain

    tensor* local = TensorLike(a);
    for (int i=0; i<local->size; i++) {
        local->data[i] = a->data[i] > 0.0 ? 1.0 : 0.0;
    }

    a->grad = mul_k(local, upstream);
}


// todo-high:
// By modifying strides, for example, an array can be transposed
// or reshaped at zero cost (no memory needs to be copied).
// question-now: here and in batched_transpose_k, return a contiguous copy instead of the original?
tensor* transpose_k(tensor* x) {
    assert_input(x, 2);
    // int shape_0 = x->shape[0];
    // x->shape[0] = x->shape[1];
    // x->shape[1] = shape_0;

    // int stride_0 = x->stride[0];
    // x->stride[0] = x->stride[1];
    // x->stride[1] = stride_0;
    // return x;


    int s1 = x->shape[0], s2 = x->shape[1];
    // [S1, S2] -> [S2, S1]
    //   - to go to the next row, need to skip S2 elements
    //   - to go to the next column, need to skip 1
    int stride_next_row = s2, stride_next_col = 1;

    tensor* out = Tensor(s2, s1);

    // basically need an indexing pattern to access the elements of the array;
    // the next idx is different inside-a-row vs when switching to the next row:
    //    when within a row -- next idx is "curr_idx + stride_col"
    //    when switching to a new row -- "first idx (of curr row) + stride_row"

    int idx_orig = 0;

    // these two loops are only needed to conveniently provide ranges over s2, s1
    for (int s1_transposed=0; s1_transposed < s2; s1_transposed++){
        for (int s2_transposed=0; s2_transposed < s1; s2_transposed++){

            int idx_trans = (s1_transposed * stride_next_col) + (s2_transposed * stride_next_row);
            // cout << "idx_orig: " << idx_orig
            out->data[idx_orig++] = x->data[idx_trans];
            // cout << "; idx_trans: " << idx_trans << endl;
        }
    }
    return out;
}
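
// Worked example (illustrative): x = [[1 2 3],   (2, 3)
//                                     [4 5 6]]
// With stride_next_row=3, stride_next_col=1, out (3, 2) is filled in order
// idx_trans = 0, 3, 1, 4, 2, 5, i.e.
//   out = [[1 4],
//          [2 5],
//          [3 6]]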


// the logic for max and batched_max was copied from: reduce_sum and batched_reduce_sum
//  todo-high: add some common logic to abstract this repeated stuff away.
//    - e.g. lua-torch implements a macro where you only need to specify the body of the for loop (op-specific),
//      the rest (common logic) is in the code of the macro itself and thus doesn't need to be duplicated for each new op
tensor* reduce_max_k(tensor* a) {
    assert_input(a, 2);
    // set the initial maximum to the first element
    tensor* out = TensorScalarFill(a->data[0]);
    // store at the 0-th location in the scratch space array
    out->scratch_space[0] = TensorLikeFill(out, 0.);
    for (int i=0; i<a->size; i++) {
        if (a->data[i] > out->data[0]){
            out->data[0] = a->data[i];
            // will be used in reduce_max_bwd
            out->scratch_space[0]->data[0] = i;
        }
    }
    return out;
}
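
// Hedged sketch (not the actual bwd kernel, which lives elsewhere): one way the
// argmax recorded above could be consumed, mirroring relu_bwd's "local * upstream"
// convention. "reduce_max_bwd_sketch" is a hypothetical name; upstream is a
// scalar here, since reduce_max_k reduces to a scalar.
void reduce_max_bwd_sketch(tensor* upstream, tensor* out) {
    tensor* a = out->inputs[0];
    int argmax = (int)out->scratch_space[0]->data[0];
    // the local grad is 1.0 at the max element and 0.0 elsewhere, so the
    // (scalar) upstream grad flows only into the argmax position
    a->grad = TensorLikeFill(a, 0.0);
    a->grad->data[argmax] = upstream->data[0];
}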


tensor* neg_k(tensor* a) {
    assert_input(a, -1);
    tensor* out = TensorLike(a);
    for (int i=0; i<out->size; i++){
        out->data[i] = - a->data[i];
    }
    return out;
}


tensor* exp_k(tensor* a) {
    assert_input(a, -1);
    tensor* out = TensorLikeFill(a, 0.0);
    for (int i=0; i<out->size; i++){
        out->data[i] = expf(a->data[i]);
    }
    return out;
}


tensor* log_k(tensor* a) {
    assert_input(a, -1);
    tensor* out = TensorLikeFill(a, 0.0);
    for (int i=0; i<out->size; i++){
        out->data[i] = logf(a->data[i]);
    }
    return out;
}



//    batched


// a.shape (B, N, M)
// b.shape (B, M, D)
tensor* batched_matmul_k(tensor* a, tensor* b) {
    assert_input(a, 3);
    assert_input(b, 3);
    if (a->shape[2] != b->shape[1]){
        printf("[batched_matmul_k] Error: inner dim doesn't match\n");
        exit(1);
    }

    int B = a->shape[0], N = a->shape[1], M = a->shape[2];
    int D = b->shape[2];

    tensor* out = Tensor(B, N, D);

    for (int i=0; i<B; i++){
        // todo: use tensor_like to preserve strides of the original out? Well, the original curr_out is contiguous anyway
        // todo: use view_3d instead of manually doing it here; the problem is view_3d does not handle a 2d view on a 3d tensor (IOW here a dim actually needs to collapse, resulting in a 2d view)

        // comment: the problem is that out[i] is a float, not a Tensor;
        //  curr_out is a scratch throw-away tensor, needed bc matmul_k_ expects a Tensor struct for the output argument
        // todo-now: find a proper fix to the issue above: creating these throw-away tensors creates memory leaks -- at least need to free them
        tensor* curr_out = TensorNoData(N, D);
        curr_out->data = out->data + (i * out->stride[0]);

        // comment: a[i], b[i], out[i] is incorrect, bc such indexing would return the i-th element of the tensor -- but I don't want the i-th element, I want the i-th row! So use strides instead
        // todo: index_2d?

        tensor* curr_a = TensorNoData(N, M);
        curr_a->data = a->data + (i * a->stride[0]);

        tensor* curr_b = TensorNoData(M, D);
        curr_b->data = b->data + (i * b->stride[0]);

        matmul_k_(curr_a, curr_b, curr_out);
    }
    return out;
}
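
// Hedged sketch for the leak noted in the todo-now above, assuming TensorNoData
// heap-allocates only the struct (and not a data buffer) so that a plain free()
// is the right deallocator -- if the repo has a dedicated FreeTensor() helper,
// that should be used instead:
//   matmul_k_(curr_a, curr_b, curr_out);
//   free(curr_a); free(curr_b); free(curr_out);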

tensor* batched_reduce_sum_k(tensor* a) {
    assert_input(a, 2);

    int B = a->shape[0], N = a->shape[1];
    tensor* out = Tensor(B, 1);

    for (int b=0; b<B; b++){
        // a[i], b[i], out[i] is incorrect, bc such indexing would return
        // the i-th element of the tensor -- but I don't want the i-th element,
        // I want the i-th row! So use:
        tensor* curr_a = TensorNoData(1, N);
        curr_a->data = a->data + (b * a->stride[0]);

        tensor* curr_out = reduce_sum_k(curr_a);
        out->data[b] = curr_out->data[0];
    }
    return out;
}


tensor* batched_reduce_max_k(tensor* a) {
    assert_input(a, 2);

    int B = a->shape[0], N = a->shape[1];
    tensor* out = Tensor(B, 1);
    out->scratch_space[0] = TensorLikeFill(out, 0.);

    for (int b=0; b<B; b++){
        tensor* curr_a = TensorNoData(1, N);
        curr_a->data = a->data + (b * a->stride[0]);

        tensor* curr_out = reduce_max_k(curr_a);
        out->data[b] = curr_out->data[0];
        out->scratch_space[0]->data[b] = curr_out->scratch_space[0]->data[0];
    }
    return out;
}

// todo-now: for all "batched" kernels, can I just reshape the input to (B*first_dim), run the regular kernel, and then reshape back?

// todo-now: to re-use this fwd kernel for CUDA, need to use strides in the cuda kernels (instead of indexing manually)
// usage: tensor* local_a = batched_transpose_k(b); // (B, M, D) -> (B, D, M)
// comment: this is equivalent to numpy's np.transpose(x, axes=(0, 2, 1))
tensor* batched_transpose_k(tensor* x){
    assert_input(x, 3);

    int shape_1 = x->shape[1];
    x->shape[1] = x->shape[2];
    x->shape[2] = shape_1;

    int stride_1 = x->stride[1];
    x->stride[1] = x->stride[2];
    x->stride[2] = stride_1;
    return x;
}
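
// Hedged sketch for the "question-now" above transpose_k: a contiguous-copy
// variant that materializes the permuted data instead of mutating/aliasing x.
// "batched_transpose_copy_k" is a hypothetical name, not an existing kernel.
tensor* batched_transpose_copy_k(tensor* x){
    assert_input(x, 3);
    int B = x->shape[0], N = x->shape[1], M = x->shape[2];
    tensor* out = Tensor(B, M, N);   // (B, N, M) -> (B, M, N)
    for (int b=0; b<B; b++){
        for (int n=0; n<N; n++){
            for (int m=0; m<M; m++){
                // read through x's strides so a previously stride-swapped x still works
                out->data[b*out->stride[0] + m*out->stride[1] + n*out->stride[2]]
                    = x->data[b*x->stride[0] + n*x->stride[1] + m*x->stride[2]];
            }
        }
    }
    return out;
}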