// tensor.cpp
#include <stdio.h>   // fprintf/stderr diagnostics
#include <stdlib.h>  // malloc/exit

#include "nn.h"
#include "init.cpp"
#include "param.cpp"
#include "autograd.cpp"
#include "gc.cpp"
  6  
  7  // todo-now: assert callback func pointers are not NULL, before calling them
  8  
  9  extern void (*COPY_TO_DEVICE)(tensor*);
 10  extern tensor* (*COPY_FROM_DEVICE)(tensor*);
 11  
 12  
 13  tensor* TensorNoData2d(int y, int z)
 14  {
 15      tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor)));
 16  
 17      t->num_dims = 2;
 18      t->size = y*z;
 19      // todo-low: one line
 20      t->shape[0] = y;
 21      t->shape[1] = z;
 22  
 23      t->stride[0] = z;
 24      t->stride[1] = 1;
 25  
 26      t->data = NULL;
 27  
 28      // todo: put the fields below into an autograd_op struct,
 29      // and store a single pointer to that struct on the tensor
 30      // -- this way more organized
 31  
 32      // for autograd engine:
 33  
 34      // true by default, modified in an op impl if that tensor was produced by an op
 35      t->is_leaf = true;
 36      t->num_inputs = -1;
 37      // unlike num_inputs (which I can't set here in the constructor,
 38      // bc it's unknown at this moment, bc depends on whatever ops is
 39      // being called on that tensor), num_uses is 1nown from the start
 40      // -- it's 0 for all ops
 41      t->num_uses = t->_num_uses = 0;
 42      t->name = random_chars(3);
 43  
 44      t->grad_fn = NULL;
 45      t->grad = NULL;
 46      // note: it makes more sense to set this in ops, because
 47      // there I set all other autograd attributes on tensors.
 48      // But it would be repetitive to set the same attr
 49      // (t->backward) in every op, so set it here
 50      t->backward = backward;
 51  
 52  
 53      // todo-high: here and in the other constructors, set the below fields to NULL?
 54      // t->scratch_space;
 55      // t->inputs;
 56      // t->non_grad_inputs;
 57  
 58      return t;
 59  }
 60  
 61  // todo: add EmptyTensor, EmptyTensorLike, make TensorLikeFill use EmptyTensorLike (instead of TensorLike)
 62  tensor* TensorNoData3d(int x, int y, int z)
 63  {
 64      tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor)));
 65  
 66      t->num_dims = 3;
 67      t->size = x*y*z;
 68      t->shape[0] = x;
 69      t->shape[1] = y;
 70      t->shape[2] = z;
 71  
 72      // todo-high: should express later outer strides in terms of inner strides?
 73      t->stride[0] = y*z;
 74      t->stride[1] = z;
 75      t->stride[2] = 1;
 76  
 77      t->data = NULL;
 78  
 79      t->is_leaf = true;
 80      t->num_inputs = -1;
 81      t->num_uses = t->_num_uses = 0;
 82      t->name = random_chars(3);
 83  
 84      t->grad_fn = NULL;
 85      t->grad = NULL;
 86      t->backward = backward;
 87  
 88      return t;
 89  }
 90  
 91  // todo: minimize duplication (this vs other constructors) this was copy-pasted from TensorNoData3d
 92  tensor* TensorNoData4d(int o, int x, int y, int z)
 93  {
 94      tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor)));
 95  
 96      t->num_dims = 4;
 97      t->size = o*x*y*z;
 98      t->shape[0] = o;
 99      t->shape[1] = x;
100      t->shape[2] = y;
101      t->shape[3] = z;
102  
103      t->stride[0] = x*y*z;
104      t->stride[1] = y*z;
105      t->stride[2] = z;
106      t->stride[3] = 1;
107  
108      t->data = NULL;
109  
110      // for autograd engine:
111      t->is_leaf = true;
112      t->num_inputs = -1;
113      t->num_uses = t->_num_uses = 0;
114      t->name = random_chars(3);
115  
116      t->grad_fn = NULL;
117      t->grad = NULL;
118      t->backward = backward;
119  
120      return t;
121  }
122  
// todo: EmptyTensor constructors at the moment do not call COPY_TO_DEVICE

// "empty" means non-initialized, but with data storage already allocated
126  tensor* EmptyTensor2d(int s1, int s2)
127  {
128      tensor* t = TensorNoData2d(s1, s2);
129      t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size));
130      add_to_gc(t);
131      t->device = CPU;
132      return t;
133  }
134  
135  tensor* EmptyTensor3d(int x, int y, int z)
136  {
137      tensor* t = TensorNoData3d(x, y, z);
138      t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size));
139      add_to_gc(t);
140      t->device = CPU;
141      return t;
142  }
143  
144  tensor* EmptyTensor4d(int o, int x, int y, int z)
145  {
146      tensor* t = TensorNoData4d(o, x, y, z);
147      t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size));
148      add_to_gc(t);
149      t->device = CPU;
150      return t;
151  }
152  
153  
154  tensor* Tensor2d(int s1, int s2)
155  {
156      tensor* t = EmptyTensor2d(s1, s2);
157      normal_init(t);
158      // todo-low: directly initialize random floats on gpu (avoid initializing on cpu, and then moving)
159      COPY_TO_DEVICE(t);
160      return t;
161  }
162  
163  tensor* Tensor3d(int s1, int s2, int s3)
164  {
165      tensor* t = EmptyTensor3d(s1, s2, s3);
166      normal_init(t);
167      COPY_TO_DEVICE(t);
168      return t;
169  }
170  
171  tensor* Tensor4d(int s1, int s2, int s3, int s4)
172  {
173      tensor* t = EmptyTensor4d(s1, s2, s3, s4);
174      normal_init(t);
175      COPY_TO_DEVICE(t);
176      return t;
177  }
178  
179  
180  /*
181  for convince to avoid:
182      int N = x->shape[0], M = x->shape[1];
183      tensor* out = Tensor2d(N, D);
184  */
185  tensor* TensorLike2d(tensor* t)
186  {
187      int s1 = t->shape[0], s2 = t->shape[1];
188      return Tensor2d(s1, s2);
189  }
190  
191  tensor* TensorLike3d(tensor* t)
192  {
193      int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2];
194      return Tensor3d(s1, s2, s3);
195  }
196  
197  tensor* TensorLike4d(tensor* t)
198  {
199      int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2], s4 = t->shape[3];
200      return Tensor4d(s1, s2, s3, s4);
201  }
202  
203  
204  tensor* TensorLikeNoData2d(tensor* t)
205  {
206      int s1 = t->shape[0], s2 = t->shape[1];
207      return TensorNoData2d(s1, s2);
208  }
209  
210  tensor* TensorLikeNoData3d(tensor* t)
211  {
212      int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2];
213      return TensorNoData3d(s1, s2, s3);
214  }
215  
216  tensor* TensorLikeNoData4d(tensor* t)
217  {
218      int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2], s4 = t->shape[3];
219      return TensorNoData4d(s1, s2, s3, s4);
220  }
221  
222  
223  
224  // todo: in each TensorLikeFill wasteful to init w random value
225  // using GetRandomFloat and then overwrite them anyway
226  tensor* TensorLikeFill2d(tensor* t, float value)
227  {
228      tensor* device_t_new = TensorLike2d(t);
229      tensor* t_new = COPY_FROM_DEVICE(device_t_new);
230      for (int i=0; i<t_new->size; i++)
231          t_new->data[i] = value;
232      // todo-high: all TensorLikeFillNd constructors (and TensorScalarFill) invoke COPY_TO_DEVICE twice -- once here, another time in TensorLike2d -> Tensor2d -> COPY_TO_DEVICE
233      COPY_TO_DEVICE(t_new);
234      return t_new;
235  }
236  
237  tensor* TensorLikeFill3d(tensor* t, float value)
238  {
239      tensor* device_t_new = TensorLike3d(t);
240      tensor* t_new = COPY_FROM_DEVICE(device_t_new);
241      for (int i=0; i<t_new->size; i++)
242          t_new->data[i] = value;
243      COPY_TO_DEVICE(t_new);
244      return t_new;
245  }
246  
247  tensor* TensorLikeFill4d(tensor* t, float value)
248  {
249      tensor* device_t_new = TensorLike4d(t);
250      tensor* t_new = COPY_FROM_DEVICE(device_t_new);
251      for (int i=0; i<t_new->size; i++)
252          t_new->data[i] = value;
253      COPY_TO_DEVICE(t_new);
254      return t_new;
255  }
256  
257  
258  tensor* TensorScalarFill(float value)
259  {
260      // todo: wasteful to init w random value using GetRandomFloat
261      //  and then overwrite them anyway
262      tensor* device_t_new = Tensor2d(1, 1);
263      tensor* t_new = COPY_FROM_DEVICE(device_t_new);
264      // needed bc Tensor initializes to random value
265      t_new->data[0] = value;
266      COPY_TO_DEVICE(t_new);
267      return t_new;
268  }
269  
270  
271  
272  
// the below implements function dispatching based on the number of arguments
//  abstracts constructors of each family by dispatching to different impls depending on the number of args

// need this instead of sizeof based impl, otherwise preprocessor fails
// counts 1..5 args by shifting the reversed sequence into the N slot;
// NOTE: zero arguments is not supported (VA_NARGS() would still yield 1)
#define VA_NARGS_IMPL(_1, _2, _3, _4, _5, N, ...) N
#define VA_NARGS(...) VA_NARGS_IMPL(__VA_ARGS__, 5, 4, 3, 2, 1)

// need inner otherwise preprocessor fails
// (two levels so macro arguments are expanded before ## token pasting)
#define CONCAT(a, b) CONCAT_INNER(a, b)
#define CONCAT_INNER(a, b) a##b

#define MAX_DIM 4

// todo: static_assert(VA_NARGS(__VA_ARGS__) <= MAX_DIM, "[constructor] error")
// e.g. Tensor(a, b) -> Tensor2d(a, b); Tensor(a, b, c) -> Tensor3d(a, b, c)
#define Tensor(...) CONCAT(Tensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__)

// this preserves device, needed because often in cpu backend, one off tensors are created using TensorNoData,
// which are then passed to other kernels_, when these kernels run input checks, they expect to see that tensor
// passed to them has t->device=CPU -- so I modified the TensorNoData constructor to preserve the t->device flag
// NOTE: the ({ ... }) statement-expression is a GNU extension (gcc/clang only)
#define _TensorNoData(...) CONCAT(TensorNoData, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__)
#define TensorNoData(...) ({tensor* out = _TensorNoData(__VA_ARGS__); out->device=DEVICE; out;})

#define EmptyTensor(...) CONCAT(EmptyTensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__)
// conv_k_ -> out = EmptyTensor(...) -> (COPY_TO_DEVICE NOT called, so backend isn't set)
//  when out of conv_k_ fed to some next kernel (e.g. relu), relu does input checks, expects to see out->device=CPU
// NOTE: unlike TensorNoData (which does NOT allocate t->data on any device), for EmptyTensor it's incorrect to set "out->device=DEVICE" -- because EmptyTensor only allocates on cpu at the moment
// #define _EmptyTensor(...) CONCAT(EmptyTensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__)
// #define EmptyTensor(...) ({tensor* out = _EmptyTensor(__VA_ARGS__); out->device=CPU; out;})
301  
302  /*
303  // question-now: 
304  //  use funcs of this form instead of the macros above?
305  //  This will also allow to call e.g. COPY_TO_DEVICE only once in e.g. Tensor fn, instead of needing to call it multiple times from each impl (e.g. Tensor2d)
306  
307  #include <stdarg.h>
308  
309  #define COUNT_ARGS(...) \
310      (sizeof((int[]){__VA_ARGS__})/sizeof(int))
311  
312  #define Tensor(...) TensorImpl(COUNT_ARGS(__VA_ARGS__), __VA_ARGS__)
313  
314  tensor* TensorImpl(int num_args, ...){
315      va_list args;
316      va_start(args, num_args);
317  
318      int s0 = va_arg(args, int);
319      int s1 = va_arg(args, int);
320  
321      if (num_args == 2){
322          return Tensor2d(s0, s1);
323      } else if (num_args == 3){
324          int s2 = va_arg(args, int);
325          return Tensor3d(s0, s1, s2);
326      } else if (num_args == 4){
327          int s2 = va_arg(args, int);
328          int s3 = va_arg(args, int);
        return Tensor4d(s0, s1, s2, s3); // note: was Tensor3d -- wrong dispatch for 4 args
330      }
331      va_end(args);
332  }
333  */
334  
335  // comment:
336  //  use functions (instead of macros) for TensorLike, TensorLikeFill, bc preprocessor can't
337  //  evaluate runtime value from a struct member at pre-processing time, so handle it in a fn
338  
339  tensor* TensorLike(tensor* t){
340      if (t->num_dims==2)
341          return TensorLike2d(t);
342      else if (t->num_dims==3)
343          return TensorLike3d(t);
344      else if (t->num_dims==4)
345          return TensorLike4d(t);
346      else
347          exit(1);
348  }
349  
350  // to avoid calling GetRandomFloat (advancing RNG state) in COPY_FROM_DEVICE;
351  // also useful to call when you need a "tensor shell", while the actual t->data comes from somewhere else (already allocated)
352  // -- avoids needing to call: 1) t = Tensor(...); 2) free t->data; 3) t->data = already_allocated
353  tensor* TensorLikeNoData(tensor* t){
354      if (t->num_dims==2)
355          return TensorLikeNoData2d(t);
356      else if (t->num_dims==3)
357          return TensorLikeNoData3d(t);
358      else if (t->num_dims==4)
359          return TensorLikeNoData4d(t);
360      else
361          exit(1);
362  }
363  
364  tensor* TensorLikeFill(tensor* t, float val){
365      if (t->num_dims==2)
366          return TensorLikeFill2d(t, val);
367      else if (t->num_dims==3)
368          return TensorLikeFill3d(t, val);
369      else if (t->num_dims==4)
370          return TensorLikeFill4d(t, val);
371      else
372          exit(1);
373  }