// tensor.cpp
1 #include "nn.h" 2 #include "init.cpp" 3 #include "param.cpp" 4 #include "autograd.cpp" 5 #include "gc.cpp" 6 7 // todo-now: assert callback func pointers are not NULL, before calling them 8 9 extern void (*COPY_TO_DEVICE)(tensor*); 10 extern tensor* (*COPY_FROM_DEVICE)(tensor*); 11 12 13 tensor* TensorNoData2d(int y, int z) 14 { 15 tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor))); 16 17 t->num_dims = 2; 18 t->size = y*z; 19 // todo-low: one line 20 t->shape[0] = y; 21 t->shape[1] = z; 22 23 t->stride[0] = z; 24 t->stride[1] = 1; 25 26 t->data = NULL; 27 28 // todo: put the fields below into an autograd_op struct, 29 // and store a single pointer to that struct on the tensor 30 // -- this way more organized 31 32 // for autograd engine: 33 34 // true by default, modified in an op impl if that tensor was produced by an op 35 t->is_leaf = true; 36 t->num_inputs = -1; 37 // unlike num_inputs (which I can't set here in the constructor, 38 // bc it's unknown at this moment, bc depends on whatever ops is 39 // being called on that tensor), num_uses is 1nown from the start 40 // -- it's 0 for all ops 41 t->num_uses = t->_num_uses = 0; 42 t->name = random_chars(3); 43 44 t->grad_fn = NULL; 45 t->grad = NULL; 46 // note: it makes more sense to set this in ops, because 47 // there I set all other autograd attributes on tensors. 48 // But it would be repetitive to set the same attr 49 // (t->backward) in every op, so set it here 50 t->backward = backward; 51 52 53 // todo-high: here and in the other constructors, set the below fields to NULL? 
54 // t->scratch_space; 55 // t->inputs; 56 // t->non_grad_inputs; 57 58 return t; 59 } 60 61 // todo: add EmptyTensor, EmptyTensorLike, make TensorLikeFill use EmptyTensorLike (instead of TensorLike) 62 tensor* TensorNoData3d(int x, int y, int z) 63 { 64 tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor))); 65 66 t->num_dims = 3; 67 t->size = x*y*z; 68 t->shape[0] = x; 69 t->shape[1] = y; 70 t->shape[2] = z; 71 72 // todo-high: should express later outer strides in terms of inner strides? 73 t->stride[0] = y*z; 74 t->stride[1] = z; 75 t->stride[2] = 1; 76 77 t->data = NULL; 78 79 t->is_leaf = true; 80 t->num_inputs = -1; 81 t->num_uses = t->_num_uses = 0; 82 t->name = random_chars(3); 83 84 t->grad_fn = NULL; 85 t->grad = NULL; 86 t->backward = backward; 87 88 return t; 89 } 90 91 // todo: minimize duplication (this vs other constructors) this was copy-pasted from TensorNoData3d 92 tensor* TensorNoData4d(int o, int x, int y, int z) 93 { 94 tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor))); 95 96 t->num_dims = 4; 97 t->size = o*x*y*z; 98 t->shape[0] = o; 99 t->shape[1] = x; 100 t->shape[2] = y; 101 t->shape[3] = z; 102 103 t->stride[0] = x*y*z; 104 t->stride[1] = y*z; 105 t->stride[2] = z; 106 t->stride[3] = 1; 107 108 t->data = NULL; 109 110 // for autograd engine: 111 t->is_leaf = true; 112 t->num_inputs = -1; 113 t->num_uses = t->_num_uses = 0; 114 t->name = random_chars(3); 115 116 t->grad_fn = NULL; 117 t->grad = NULL; 118 t->backward = backward; 119 120 return t; 121 } 122 123 // todo: EmptyTensor constructors at the moment do not call COPY_TO_DEVICE 124 125 // empty means non-initalized, but with data allocated to it 126 tensor* EmptyTensor2d(int s1, int s2) 127 { 128 tensor* t = TensorNoData2d(s1, s2); 129 t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size)); 130 add_to_gc(t); 131 t->device = CPU; 132 return t; 133 } 134 135 tensor* EmptyTensor3d(int x, int y, int z) 136 { 137 tensor* t = TensorNoData3d(x, y, z); 138 
t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size)); 139 add_to_gc(t); 140 t->device = CPU; 141 return t; 142 } 143 144 tensor* EmptyTensor4d(int o, int x, int y, int z) 145 { 146 tensor* t = TensorNoData4d(o, x, y, z); 147 t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size)); 148 add_to_gc(t); 149 t->device = CPU; 150 return t; 151 } 152 153 154 tensor* Tensor2d(int s1, int s2) 155 { 156 tensor* t = EmptyTensor2d(s1, s2); 157 normal_init(t); 158 // todo-low: directly initialize random floats on gpu (avoid initializing on cpu, and then moving) 159 COPY_TO_DEVICE(t); 160 return t; 161 } 162 163 tensor* Tensor3d(int s1, int s2, int s3) 164 { 165 tensor* t = EmptyTensor3d(s1, s2, s3); 166 normal_init(t); 167 COPY_TO_DEVICE(t); 168 return t; 169 } 170 171 tensor* Tensor4d(int s1, int s2, int s3, int s4) 172 { 173 tensor* t = EmptyTensor4d(s1, s2, s3, s4); 174 normal_init(t); 175 COPY_TO_DEVICE(t); 176 return t; 177 } 178 179 180 /* 181 for convince to avoid: 182 int N = x->shape[0], M = x->shape[1]; 183 tensor* out = Tensor2d(N, D); 184 */ 185 tensor* TensorLike2d(tensor* t) 186 { 187 int s1 = t->shape[0], s2 = t->shape[1]; 188 return Tensor2d(s1, s2); 189 } 190 191 tensor* TensorLike3d(tensor* t) 192 { 193 int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2]; 194 return Tensor3d(s1, s2, s3); 195 } 196 197 tensor* TensorLike4d(tensor* t) 198 { 199 int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2], s4 = t->shape[3]; 200 return Tensor4d(s1, s2, s3, s4); 201 } 202 203 204 tensor* TensorLikeNoData2d(tensor* t) 205 { 206 int s1 = t->shape[0], s2 = t->shape[1]; 207 return TensorNoData2d(s1, s2); 208 } 209 210 tensor* TensorLikeNoData3d(tensor* t) 211 { 212 int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2]; 213 return TensorNoData3d(s1, s2, s3); 214 } 215 216 tensor* TensorLikeNoData4d(tensor* t) 217 { 218 int s1 = t->shape[0], s2 = t->shape[1], s3 = t->shape[2], s4 = t->shape[3]; 219 return TensorNoData4d(s1, s2, s3, s4); 
220 } 221 222 223 224 // todo: in each TensorLikeFill wasteful to init w random value 225 // using GetRandomFloat and then overwrite them anyway 226 tensor* TensorLikeFill2d(tensor* t, float value) 227 { 228 tensor* device_t_new = TensorLike2d(t); 229 tensor* t_new = COPY_FROM_DEVICE(device_t_new); 230 for (int i=0; i<t_new->size; i++) 231 t_new->data[i] = value; 232 // todo-high: all TensorLikeFillNd constructors (and TensorScalarFill) invoke COPY_TO_DEVICE twice -- once here, another time in TensorLike2d -> Tensor2d -> COPY_TO_DEVICE 233 COPY_TO_DEVICE(t_new); 234 return t_new; 235 } 236 237 tensor* TensorLikeFill3d(tensor* t, float value) 238 { 239 tensor* device_t_new = TensorLike3d(t); 240 tensor* t_new = COPY_FROM_DEVICE(device_t_new); 241 for (int i=0; i<t_new->size; i++) 242 t_new->data[i] = value; 243 COPY_TO_DEVICE(t_new); 244 return t_new; 245 } 246 247 tensor* TensorLikeFill4d(tensor* t, float value) 248 { 249 tensor* device_t_new = TensorLike4d(t); 250 tensor* t_new = COPY_FROM_DEVICE(device_t_new); 251 for (int i=0; i<t_new->size; i++) 252 t_new->data[i] = value; 253 COPY_TO_DEVICE(t_new); 254 return t_new; 255 } 256 257 258 tensor* TensorScalarFill(float value) 259 { 260 // todo: wasteful to init w random value using GetRandomFloat 261 // and then overwrite them anyway 262 tensor* device_t_new = Tensor2d(1, 1); 263 tensor* t_new = COPY_FROM_DEVICE(device_t_new); 264 // needed bc Tensor initializes to random value 265 t_new->data[0] = value; 266 COPY_TO_DEVICE(t_new); 267 return t_new; 268 } 269 270 271 272 273 // the below implements function dispatching based on the number of arguments 274 // abstracts constructors of each family by dispatching to different impls depending on the number of args 275 276 // need this instead of sizeof based impl, otherwise preprocessor fails 277 #define VA_NARGS_IMPL(_1, _2, _3, _4, _5, N, ...) N 278 #define VA_NARGS(...) 
VA_NARGS_IMPL(__VA_ARGS__, 5, 4, 3, 2, 1) 279 280 // need inner otherwise preprocessor fails 281 #define CONCAT(a, b) CONCAT_INNER(a, b) 282 #define CONCAT_INNER(a, b) a##b 283 284 #define MAX_DIM 4 285 286 // todo: static_assert(VA_NARGS(__VA_ARGS__) <= MAX_DIM, "[constructor] error") 287 #define Tensor(...) CONCAT(Tensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__) 288 289 // this preserves device, needed because often in cpu backend, one off tensors are created using TensorNoData, 290 // which are then passed to other kernels_, when these kernels run input checks, they expect to see that tensor 291 // passed to them has t->device=CPU -- so I modified the TensorNoData constructor to preserve the t->device flag 292 #define _TensorNoData(...) CONCAT(TensorNoData, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__) 293 #define TensorNoData(...) ({tensor* out = _TensorNoData(__VA_ARGS__); out->device=DEVICE; out;}) 294 295 #define EmptyTensor(...) CONCAT(EmptyTensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__) 296 // conv_k_ -> out = EmptyTensor(...) -> (COPY_TO_DEVICE NOT called, so backend isn't set) 297 // when out of conv_k_ fed to some next kernel (e.g. relu), relu does input checks, expects to see out->device=CPU 298 // NOTE: unlike TensorNoData (which does NOT allocate t->data on any device), for EmptyTensor it's incorrect to set "out->device=DEVICE" -- because EmptyTensor only allocates on cpu at the moment 299 // #define _EmptyTensor(...) CONCAT(EmptyTensor, CONCAT(VA_NARGS(__VA_ARGS__), d))(__VA_ARGS__) 300 // #define EmptyTensor(...) ({tensor* out = _EmptyTensor(__VA_ARGS__); out->device=CPU; out;}) 301 302 /* 303 // question-now: 304 // use funcs of this form instead of the macros above? 305 // This will also allow to call e.g. COPY_TO_DEVICE only once in e.g. Tensor fn, instead of needing to call it multiple times from each impl (e.g. Tensor2d) 306 307 #include <stdarg.h> 308 309 #define COUNT_ARGS(...) 
\ 310 (sizeof((int[]){__VA_ARGS__})/sizeof(int)) 311 312 #define Tensor(...) TensorImpl(COUNT_ARGS(__VA_ARGS__), __VA_ARGS__) 313 314 tensor* TensorImpl(int num_args, ...){ 315 va_list args; 316 va_start(args, num_args); 317 318 int s0 = va_arg(args, int); 319 int s1 = va_arg(args, int); 320 321 if (num_args == 2){ 322 return Tensor2d(s0, s1); 323 } else if (num_args == 3){ 324 int s2 = va_arg(args, int); 325 return Tensor3d(s0, s1, s2); 326 } else if (num_args == 4){ 327 int s2 = va_arg(args, int); 328 int s3 = va_arg(args, int); 329 return Tensor3d(s0, s1, s2, s3); 330 } 331 va_end(args); 332 } 333 */ 334 335 // comment: 336 // use functions (instead of macros) for TensorLike, TensorLikeFill, bc preprocessor can't 337 // evaluate runtime value from a struct member at pre-processing time, so handle it in a fn 338 339 tensor* TensorLike(tensor* t){ 340 if (t->num_dims==2) 341 return TensorLike2d(t); 342 else if (t->num_dims==3) 343 return TensorLike3d(t); 344 else if (t->num_dims==4) 345 return TensorLike4d(t); 346 else 347 exit(1); 348 } 349 350 // to avoid calling GetRandomFloat (advancing RNG state) in COPY_FROM_DEVICE; 351 // also useful to call when you need a "tensor shell", while the actual t->data comes from somewhere else (already allocated) 352 // -- avoids needing to call: 1) t = Tensor(...); 2) free t->data; 3) t->data = already_allocated 353 tensor* TensorLikeNoData(tensor* t){ 354 if (t->num_dims==2) 355 return TensorLikeNoData2d(t); 356 else if (t->num_dims==3) 357 return TensorLikeNoData3d(t); 358 else if (t->num_dims==4) 359 return TensorLikeNoData4d(t); 360 else 361 exit(1); 362 } 363 364 tensor* TensorLikeFill(tensor* t, float val){ 365 if (t->num_dims==2) 366 return TensorLikeFill2d(t, val); 367 else if (t->num_dims==3) 368 return TensorLikeFill3d(t, val); 369 else if (t->num_dims==4) 370 return TensorLikeFill4d(t, val); 371 else 372 exit(1); 373 }