// ops.cpp
#include "nn.h"
#include "indexing.cpp"
#include "utils.cpp"
#include "backends/select_backend.cpp"

// **** operations ****
//   - shared between the CUDA and CPU backends
//   - user-facing, unlike the other abstractions
//   - call the backend primitives and, in addition, record data for the autograd:
//       - allocate grad buffers
//       - write the local derivatives for each input


// todo-now: temporarily commented out the ops that don't yet have a CUDA impl, so that compilation doesn't fail because the linker can't find the _k kernels those ops expect
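
// For orientation: a hypothetical sketch of how a backward pass might consume
// the fields recorded by the ops below (inputs, num_inputs, grad_fn).
// Illustrative only -- backward() and the exact grad_fn call signature are
// assumptions, not the actual autograd implementation.
//
//   void backward(tensor* t) {
//       if (t->grad_fn) t->grad_fn(t);       // writes local derivatives into the inputs' grad buffers
//       for (int i = 0; i < t->num_inputs; i++)
//           backward(t->inputs[i]);          // a real impl would topo-sort and account for num_uses
//   }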

tensor* add(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = add_k(a, b);
    t->is_leaf = false;
    // todo-low: check tensor->requires_grad before doing the steps below, including allocating the grad buffers

    // todo: this can be further abstracted -- creating a binary_op function (see the sketch after this function)
    // todo: and even further abstracted -- creating a binary_elementwise_op function

    // fill in the additional autograd info on the output tensor
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 0;
    t->grad_fn = add_bwd;
    return t;
}
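
// A minimal sketch of the binary_op abstraction the todo above mentions.
// binary_op itself is hypothetical (not yet part of the API); the template
// parameters stand in for the kernel/backward function types, which this
// file doesn't pin down.
template <typename Kernel, typename GradFn>
static tensor* binary_op(tensor* a, tensor* b, Kernel k, GradFn bwd, int op_type) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = k(a, b);    // forward kernel, e.g. add_k
    t->is_leaf = false;
    t->num_inputs = 2;      // record the same autograd metadata as the ops in this file
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = op_type;
    t->grad_fn = bwd;
    return t;
}
// add() would then reduce to: return binary_op(a, b, add_k, add_bwd, 0);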

tensor* sub(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = sub_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 1;
    t->grad_fn = sub_bwd;
    return t;
}

tensor* mul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = mul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 2;
    t->grad_fn = mul_bwd;
    return t;
}

tensor* matmul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    // e.g.: a(N, M) @ b(M, D) = t(N, D)
    tensor* t = matmul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 3;
    t->grad_fn = matmul_bwd;
    return t;
}
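
// For reference, the standard matmul gradients (what matmul_bwd presumably
// computes): grad_a = grad_t @ b^T with shape (N, M), and
// grad_b = a^T @ grad_t with shape (M, D).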

tensor* div(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    tensor* t = div_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 20;
    t->grad_fn = div_bwd;
    return t;
}

tensor* repeat(tensor* a, int axis, int num_repeats) {
    a->num_uses++;
    tensor* t = repeat_k(a, axis, num_repeats);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->non_grad_inputs[0] = axis;
    t->non_grad_inputs[1] = num_repeats;
    t->op_type = 18;
    t->grad_fn = repeat_bwd;
    return t;
}
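
// For reference: since the forward copies a num_repeats times along axis,
// repeat_bwd presumably reduce-sums the incoming grad over the repeats,
// back down to a's original shape.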

tensor* select(tensor* a, tensor* idx) {
    a->num_uses++;
    idx->num_uses++;
    tensor* t = select_k(a, idx);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = idx;
    t->op_type = 14;
    t->grad_fn = select_bwd;
    return t;
}
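
// For reference: assuming select_k gathers rows of a by idx, select_bwd
// presumably scatter-adds the incoming grad back into the gathered rows
// of a's grad buffer.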

tensor* pow(tensor* a, int exponent) {
    a->num_uses++;
    tensor* t = pow_k(a, exponent);
    t->is_leaf = false;
    // note: by "inputs" I mean tensor inputs (the inputs we compute grads with respect to),
    // so even though this op takes two arguments, it has only one input for the purposes of the autograd
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->non_grad_inputs[0] = exponent;
    t->op_type = 4;
    t->grad_fn = pow_bwd;
    return t;
}
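
// For reference, the local derivative pow_bwd needs is d/da a^n = n * a^(n-1),
// with n presumably recovered from non_grad_inputs[0].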

tensor* relu(tensor* a) {
    a->num_uses++;
    tensor* t = relu_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 6;
    t->grad_fn = relu_bwd;
    return t;
}

tensor* transpose(tensor* a) {
    a->num_uses++;
    tensor* t = transpose_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 7;
    t->grad_fn = transpose_bwd;
    return t;
}

tensor* neg(tensor* a) {
    a->num_uses++;
    tensor* t = neg_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 19;
    t->grad_fn = neg_bwd;
    return t;
}

// todo: does this conflict with C's exp from <math.h>/<cmath>? (in C++ it should just overload on the tensor* argument)
tensor* exp(tensor* a) {
    a->num_uses++;
    tensor* t = exp_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 16;
    t->grad_fn = exp_bwd;
    return t;
}

tensor* log(tensor* a) {
    a->num_uses++;
    tensor* t = log_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 15;
    t->grad_fn = log_bwd;
    return t;
}

tensor* batched_matmul(tensor* a, tensor* b) {
    a->num_uses++;
    b->num_uses++;
    // e.g.: a(B, N, M) @ b(B, M, D) = t(B, N, D)
    tensor* t = batched_matmul_k(a, b);
    t->is_leaf = false;
    t->num_inputs = 2;
    t->inputs[0] = a;
    t->inputs[1] = b;
    t->op_type = 8;
    t->grad_fn = batched_matmul_bwd;
    return t;
}

tensor* batched_flatten(tensor* a) {
    a->num_uses++;
    tensor* t = batched_flatten_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 13;
    t->grad_fn = batched_flatten_bwd;
    return t;
}

tensor* reduce_sum(tensor* a) {
    a->num_uses++;
    tensor* t = reduce_sum_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 5;
    t->grad_fn = reduce_sum_bwd;
    return t;
}

tensor* batched_reduce_sum(tensor* a) {
    a->num_uses++;
    tensor* t = batched_reduce_sum_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 17;
    t->grad_fn = batched_reduce_sum_bwd;
    return t;
}

tensor* reduce_max(tensor* a) {
    a->num_uses++;
    tensor* t = reduce_max_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 21;
    t->grad_fn = reduce_max_bwd;
    return t;
}
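
// For reference: a max reduction's gradient flows only to the element(s)
// that attained the max; reduce_max_bwd presumably routes the incoming
// grad to the argmax positions and leaves the rest zero.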

tensor* batched_reduce_max(tensor* a) {
    a->num_uses++;
    tensor* t = batched_reduce_max_k(a);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = a;
    t->op_type = 22;
    t->grad_fn = batched_reduce_max_bwd;
    return t;
}

tensor* conv(tensor* input, tensor* kernel, tensor* bias) {
    input->num_uses++;
    kernel->num_uses++;
    bias->num_uses++;
    tensor* t = conv_k(input, kernel, bias);
    t->is_leaf = false;
    t->num_inputs = 3;
    t->inputs[0] = input;
    t->inputs[1] = kernel;
    t->inputs[2] = bias;
    t->op_type = 9;
    t->grad_fn = bwd_conv_k;
    return t;
}

tensor* batched_conv(tensor* input, tensor* kernel, tensor* bias) {
    input->num_uses++;
    kernel->num_uses++;
    bias->num_uses++;
    tensor* t = batched_conv_k(input, kernel, bias);
    t->is_leaf = false;
    t->num_inputs = 3;
    t->inputs[0] = input;
    t->inputs[1] = kernel;
    t->inputs[2] = bias;
    t->op_type = 10;
    t->grad_fn = bwd_batched_conv_k;
    return t;
}

tensor* maxpool(tensor* input) {
    input->num_uses++;
    tensor* t = maxpool_k(input);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = input;
    t->op_type = 11;
    t->grad_fn = bwd_maxpool_k;
    return t;
}

tensor* batched_maxpool(tensor* input) {
    input->num_uses++;
    tensor* t = batched_maxpool_k(input);
    t->is_leaf = false;
    t->num_inputs = 1;
    t->inputs[0] = input;
    t->op_type = 12;
    t->grad_fn = bwd_batched_maxpool_k;
    return t;
}