kernels.cpp
1 #include <math.h> // log, pow 2 3 4 tensor* neg_k(tensor*); 5 tensor* pow_k(tensor*, int); 6 tensor* transpose_k(tensor*); 7 tensor* mul_k(tensor*, tensor*); 8 tensor* batched_transpose_k(tensor*); 9 tensor* batched_reduce_sum_k(tensor*); 10 11 12 // env though this fn is called assert_device, it's checks for cpu -- this naming is to keep parity between the two backends (to reuse common/asserts) 13 void assert_device(tensor* a){ 14 if (a->device!=CPU){ 15 printf("[assert_device] Error: expected device cpu. Saw: %i\n", a->device); 16 exit(1); 17 } 18 } 19 20 21 // todo: 22 // - to preserve old behavior, change ops to: 23 // - index from the right (inner most dims), NOT from the left (outer most) 24 // - loop over the outer-most dim as well 25 // - IOW, in nn.h when increase shape from 3 to 4, all the existing ops (indexing from left) might break? 26 // - On the other hand, the kernels are specialized to particular num dims -- so will break anyway 27 28 // note: naming convention: assume "_elementwise", by default -- and ask to specify otherwise (if not elementwise) -- e.g. reduce_, matrix_, etc 29 // todo: ? use capital letters for functions that allocate heap mem, use lowercase otherwise 30 // note: below derivatives try to follow this order -- local * upstream 31 32 // **** kernels ***** 33 // - operate on tensors 34 // - allocate ouput buff 35 // - return new tensor 36 37 38 // binary elementwise 39 40 41 42 tensor* add_k_(tensor* a, tensor* b, tensor* out) { 43 assert_binary_elementwise_non_contiguous(a, b); 44 45 // Previously was hardcoding a at for a specific index here -- conv_bwd uses add (which calls add_k_) for both 3d tensors (grads wrt input) AND 4d tensors (grads wrt kernels) 46 // todo: here and in all ops, assert same size 47 for (int i=0; i<out->size; i++){ 48 // comment: note incrementing index like this in a loop, assumes contiguous data 49 // out->data[i] = a->data[i] + b->data[i]; 50 out->data[at(out, i)] = a->data[at(a, i)] + b->data[at(b, i)]; 51 } 52 return out; 53 } 54 55 tensor* unsafe_add_k_(tensor* a, tensor* b, tensor* out) { 56 assert_device(a); 57 assert_device(b); 58 assert_same_size(a, b); 59 for (int i=0; i<out->size; i++){ 60 // comment: removed at, becuase at only supports 2d and 3d, but from batched_flatten_k I'm passing a 4d tensor 61 // out->data[at(out, i)] = a->data[at(a, i)] + b->data[at(b, i)]; 62 out->data[i] = a->data[i] + b->data[i]; 63 } 64 return out; 65 } 66 67 68 // todo: use "const tensor*" instead of "tensor" 69 tensor* add_k(tensor* a, tensor* b) { 70 tensor* out = TensorLike(a); 71 return add_k_(a, b, out); 72 } 73 74 tensor* sub_k_(tensor* a, tensor* b, tensor* out) { 75 assert_binary_elementwise(a, b); 76 for (int i=0; i<out->size; i++){ 77 out->data[i] = a->data[i] - b->data[i]; 78 } 79 return out; 80 } 81 82 tensor* sub_k(tensor* a, tensor* b) { 83 tensor* out = TensorLike(a); 84 return sub_k_(a, b, out); 85 } 86 87 tensor* mul_k_(tensor* a, tensor* b, tensor* out) { 88 assert_binary_elementwise(a, b); 89 for (int i=0; i<out->size; i++){ 90 out->data[i] = a->data[i] * b->data[i]; 91 } 92 return out; 93 } 94 95 tensor* mul_k(tensor* a, tensor* b) { 96 tensor* out = TensorLike(a); 97 return mul_k_(a, b, out); 98 } 99 100 101 // binary 102 103 104 // a.shape (N, M) 105 // b.shape (M, D) 106 // out.shape (N, D) 107 void matmul_k_(tensor* a, tensor* b, tensor* out) { 108 // supports non-contiguous 109 assert_device(a); 110 assert_device(b); 111 assert_dim(a, 2); 112 assert_dim(b, 2); 113 if (a->shape[1] != b->shape[0]){ 114 printf("[matmul_k_] Error: inner dim doesn't match, saw: a(%i, %i) b(%i, %i)\n", a->shape[0], a->shape[1], b->shape[0], b->shape[1]); 115 exit(1); 116 } 117 118 // todo: add assert that num dims in inputs == 2 119 120 int N = a->shape[0], M = a->shape[1]; 121 int D = b->shape[1]; 122 123 // (N, M) @ (M, D) = (N, D) 124 for (int n=0; n<N; n++){ 125 for (int d=0; d<D; d++){ 126 float sum = 0; 127 for (int m=0; m<M; m++){ 128 // n*M, because for each "n" you skip "M" values 129 130 // uses index_ instead of "a->data[n * M + m * 1]" as to not assume strides 131 // printf("index_2d(a, n, m): %i\n", index_2d(a, n, m)); 132 sum += a->data[index_2d(a, n, m)] * b->data[index_2d(b, m, d)]; 133 } 134 out->data[index_2d(out, n, d)] = sum; 135 } 136 } 137 } 138 139 // a.shape (N, M) 140 // b.shape (M, D) 141 // out.shape (N, D) 142 tensor* matmul_k(tensor* a, tensor* b) { 143 int N = a->shape[0], D = b->shape[1]; 144 tensor* out = Tensor(N, D); 145 matmul_k_(a, b, out); 146 return out; 147 } 148 149 150 tensor* div_k(tensor* a, tensor* b) { 151 assert_binary_elementwise(a, b); 152 tensor* out = TensorLike(a); 153 for (int i=0; i<out->size; i++){ 154 out->data[i] = a->data[i] / b->data[i]; 155 } 156 return out; 157 } 158 159 160 tensor* repeat_k(tensor* a, int axis, int num_repeats) { 161 assert_input(a, 2); 162 if (axis != 0 && axis != 1){ 163 printf("[repeat] Unexpected axis\n"); 164 exit(1); 165 } 166 if (a->shape[axis] != 1){ 167 printf("[repeat] Shape error\n"); 168 exit(1); 169 } 170 171 tensor* out; 172 173 // a.shape (1, N) 174 if (axis==0){ 175 int N = a->shape[1]; 176 out = Tensor(num_repeats, N); 177 178 for (int b=0; b<num_repeats; b++){ 179 // points to the first element of the current b 180 float* curr_out = out->data + (b * out->stride[0]); 181 for (int i=0; i<N; i++){ 182 // for curr_a multiplying by "a->stride[0]" is not needed bc curr_a is (1, N) 183 *(curr_out+i) = *(a->data+i); 184 } 185 } 186 187 // a.shape (B, 1) 188 } else if (axis==1){ 189 int B = a->shape[0]; 190 out = Tensor(B, num_repeats); 191 192 // 1 1 1 193 // 2 2 2 194 // 3 ... 195 for (int b=0; b<B; b++){ 196 // points to the first element of the current b 197 float* curr_a = a->data + b; 198 // here indexing includes multiplying by "out->stride[0]" bc 199 // out is a 2d tensor (for curr_a multiplying by "a->stride[0]" is 200 // not needed bc curr_a is (B, 1)) 201 float* curr_out = out->data + (b * out->stride[0]); 202 for (int i=0; i<num_repeats; i++){ 203 *(curr_out+i) = *(curr_a); 204 } 205 } 206 } 207 return out; 208 } 209 210 211 tensor* select_k(tensor* a, tensor* idx) { 212 // expect shapes: 213 // a(s1, s2) 214 // idx(s1, 1) 215 // also each element of idx's first dim should be in range 0-s2 216 // (bc the first dim in idx will be used to index the 2nd dim of a) 217 // this latter condition is not being checked here 218 assert_input(a, 2); 219 assert_input(idx, 2); 220 if (idx->shape[1]!=1 || idx->shape[0]!=a->shape[0]) { 221 printf("[select] Error shape"); 222 exit(1); 223 } 224 225 int B = a->shape[0]; 226 tensor* out = Tensor(B, 1); 227 228 for (int b=0; b<B; b++){ 229 float* curr_a = a->data + (a->stride[0]*b); 230 // note: additional constraint on input: idx->data should be ints; 231 // below: 232 // 1) take pointer to the *first* element of the current batch example in a (curr_a) 233 // 2) add it idx for the current batch example (currently all tensors are floats, so need to cast idx->data to int before doing it) 234 // 3) the result of the above is a pointer, so de-reference it 235 out->data[b] = *(curr_a + (int)idx->data[b]); 236 // printf("curr_idx: %i", (int)idx->data[b]); 237 } 238 return out; 239 } 240 241 242 void select_set_(tensor* a, tensor* idx, int value) { 243 assert_input(a, 2); 244 assert_input(idx, 2); 245 if (idx->shape[1]!=1 || idx->shape[0]!=a->shape[0]) { 246 printf("[select_set_] Error shape"); 247 exit(1); 248 } 249 int B = a->shape[0]; 250 for (int b=0; b<B; b++){ 251 float* curr_a = a->data + (a->stride[0]*b); 252 *(curr_a + (int)idx->data[b]) = value; 253 } 254 } 255 256 257 // unary 258 259 tensor* sqrt_k(tensor* a) { 260 assert_input(a, -1); 261 tensor* out = TensorLike(a); 262 for (int i=0; i<out->size; i++){ 263 out->data[i] = (float)sqrt(a->data[i]); 264 } 265 return out; 266 } 267 268 tensor* pow_k(tensor* a, int exponent) { 269 // supports n-dim input but must be contiguous 270 // (because of the direct access to its data below) 271 assert_input(a, -1); 272 273 tensor* out = TensorLikeFill(a, 0.0); 274 for (int i=0; i<out->size; i++){ 275 // pow here refers to C's math function, not tiny-torch's op 276 out->data[i] = (float)pow(a->data[i], exponent); 277 } 278 return out; 279 } 280 281 282 tensor* reduce_sum_k(tensor* a) { 283 assert_input(a, -1); 284 285 // reduce to scalar 286 tensor* out = TensorScalarFill(0.0); 287 for (int i=0; i<a->size; i++){ 288 out->data[0] += a->data[i]; 289 } 290 return out; 291 } 292 293 294 tensor* relu_k(tensor* a) { 295 assert_input(a, -1); 296 297 tensor* out = TensorLike(a); 298 // todo: this kernel is more complicated to record idxs into 299 // scratch space, bc don't know size of this tensor in advance (data-dependant size) 300 // tensor* scratch_space = TensorLikeFill(out, 0.); 301 for (int i=0; i<out->size; i++){ 302 if (a->data[i] < 0.0){ 303 out->data[i] = 0.0; 304 // scratch_space->data[] 305 } else { 306 out->data[i] = a->data[i]; 307 } 308 } 309 // out->scratch_space[0] = scratch_space; 310 return out; 311 } 312 313 void relu_bwd(tensor* upstream, tensor* out) { 314 assert_input(upstream, out->num_dims); 315 tensor* a = out->inputs[0]; 316 // local 317 // todo: avoid re-computing 318 // this is kind of gradient checkpointing; 319 // ofc can avoid by making original relu ouput the mask 320 // -- but I found recomputing is a bit cleaner to explain 321 322 tensor* local = TensorLike(a); 323 for (int i=0; i<local->size; i++) { 324 local->data[i] = a->data[i] > 0.0 ? 1.0 : 0.0; 325 } 326 327 a->grad = mul_k(local, upstream); 328 } 329 330 331 // todo-high: 332 // By modifying strides, for example, an array can be transposed 333 // or reshaped at zero cost (no memory needs to be copied). 334 // question-now: here and in batched_transpose_k, return a contiguous copy instead of original? 335 tensor* transpose_k(tensor* x) { 336 assert_input(x, 2); 337 // int shape_0 = x->shape[0]; 338 // x->shape[0] = x->shape[1]; 339 // x->shape[1] = shape_0; 340 341 // int stride_0 = x->stride[0]; 342 // x->stride[0] = x->stride[1]; 343 // x->stride[1] = stride_0; 344 // return x; 345 346 347 int s1 = x->shape[0], s2 = x->shape[1]; 348 // [S1, S2] -> [S2, S1] 349 // - to go to next row, need to skip S2 elements 350 // - to go to next column, need to skip 1 351 int stride_next_row = s2, stride_next_col = 1; 352 353 tensor* out = Tensor(s2, s1); 354 355 // basically need indexing pattern to access elements of array 356 // next idx is different inside-a-row vs when switching to next row 357 // when within a row -- next idx is "curr_idx + stride_col" 358 // when switching to new row -- "first idx (of curr row) + stride_row" 359 360 int idx_orig = 0; 361 362 // these two loops are only needed to continently provide -- ranges over s2, s1 363 for (int s1_transposed=0; s1_transposed < s2; s1_transposed++){ 364 for (int s2_transposed=0; s2_transposed < s1; s2_transposed++){ 365 366 int idx_trans = (s1_transposed * stride_next_col) + (s2_transposed * stride_next_row); 367 // cout << "idx_orig: " << idx_orig 368 out->data[idx_orig++] = x->data[idx_trans]; 369 // cout << "; idx_trans: " << idx_trans << endl; 370 } 371 } 372 return out; 373 } 374 375 376 // logic for max and batched_max were copied from: reduce_sum and batched_reduce_sum 377 // todo-high: add some common logic to abstract this repeated stuff away. 378 // - e.g. lua-torch implements a macro, where you only need to specify body of the for loop (op-specific), 379 // the rest (common logic) is in the code of the macro itself and thus doens't need to duplciated for each new op 380 tensor* reduce_max_k(tensor* a) { 381 assert_input(a, 2); 382 // set initial minimum to the first element 383 tensor* out = TensorScalarFill(a->data[0]); 384 // store at 0-th location in the scratch space array 385 out->scratch_space[0] = TensorLikeFill(out, 0.); 386 for (int i=0; i<a->size; i++) { 387 if (a->data[i] > out->data[0]){ 388 out->data[0] = a->data[i]; 389 // will be used in reduce_max_bwd 390 out->scratch_space[0]->data[0] = i; 391 } 392 } 393 return out; 394 } 395 396 397 tensor* neg_k(tensor* a) { 398 assert_input(a, -1); 399 tensor* out = TensorLike(a); 400 for (int i=0; i<out->size; i++){ 401 out->data[i] = - a->data[i]; 402 } 403 return out; 404 } 405 406 407 tensor* exp_k(tensor* a) { 408 assert_input(a, -1); 409 tensor* out = TensorLikeFill(a, 0.0); 410 for (int i=0; i<out->size; i++){ 411 out->data[i] = expf(a->data[i]); 412 } 413 return out; 414 } 415 416 417 tensor* log_k(tensor* a) { 418 assert_input(a, -1); 419 tensor* out = TensorLikeFill(a, 0.0); 420 for (int i=0; i<out->size; i++){ 421 out->data[i] = logf(a->data[i]); 422 } 423 return out; 424 } 425 426 427 // batched 428 429 430 // a.shape (B, N, M) 431 // b.shape (B, M, D) 432 tensor* batched_matmul_k(tensor* a, tensor* b) { 433 assert_input(a, 3); 434 assert_input(b, 3); 435 if (a->shape[2] != b->shape[1]){ 436 printf("[batched_matmul_k] Error: inner dim doesn't match\n"); 437 exit(1); 438 } 439 440 int B = a->shape[0], N = a->shape[1], M = a->shape[2]; 441 int D = b->shape[2]; 442 443 tensor* out = Tensor(B, N, D); 444 445 for (int i=0; i<B; i++){ 446 // todo: use tensor_like to preserve strides of original out? Well original curr_out is contiguous anyway 447 // todo: use view_3d instead of manually doing it here, the problem is view_3d does not handle 2d view on 3d tensor (IOW here a dim actually needs to collapse, resulting in a 2d view) 448 449 // comment: problem is out[i] is float, not Tensor 450 // curr_out is a scratch throw away tensor, needed bc matmul_k_ expects a Tensor struct for the output argument 451 // todo-now: find a proper fix to the issue above: creating these throw away tensors creates memory leaks -- at least need to free them 452 tensor* curr_out = TensorNoData(N, D); 453 curr_out->data = out->data + (i * out->stride[0]); 454 455 // comment: a[i], b[i], out[i] is incorrect. Bc such indexing would return i-th element of the tensor, but I don't want the i-th element, I want i-th row! So use 456 // todo: index_2d? 457 458 tensor* curr_a = TensorNoData(N, M); 459 curr_a->data = a->data + (i * a->stride[0]); 460 461 tensor* curr_b = TensorNoData(M, D); 462 curr_b->data = b->data + (i * b->stride[0]); 463 464 matmul_k_(curr_a, curr_b, curr_out); 465 } 466 return out; 467 } 468 469 tensor* batched_reduce_sum_k(tensor* a) { 470 assert_input(a, 2); 471 472 int B = a->shape[0], N = a->shape[1]; 473 tensor* out = Tensor(B, 1); 474 475 for (int b=0; b<B; b++){ 476 // a[i], b[i], out[i] is incorrect. Bc such indexing would return 477 // i-th element of the tensor, but I don't want the i-th element, 478 // I want i-th row! So use: 479 tensor* curr_a = TensorNoData(1, N); 480 curr_a->data = a->data + (b * a->stride[0]); 481 482 tensor* curr_out = reduce_sum_k(curr_a); 483 out->data[b] = curr_out->data[0]; 484 } 485 return out; 486 } 487 488 489 tensor* batched_reduce_max_k(tensor* a) { 490 assert_input(a, 2); 491 492 int B = a->shape[0], N = a->shape[1]; 493 tensor* out = Tensor(B, 1); 494 out->scratch_space[0] = TensorLikeFill(out, 0.); 495 496 for (int b=0; b<B; b++){ 497 tensor* curr_a = TensorNoData(1, N); 498 curr_a->data = a->data + (b * a->stride[0]); 499 500 tensor* curr_out = reduce_max_k(curr_a); 501 out->data[b] = curr_out->data[0]; 502 out->scratch_space[0]->data[b] = curr_out->scratch_space[0]->data[0]; 503 } 504 return out; 505 } 506 507 // todo-now: for all "batched" kernels can I just reshape input to (B*first_dim), run regular max kernel and then reshape back? 508 509 // todo-now: to re-use this fwd kernel for CUDA, need to use strides in cuda kernels (instead of indexing manually) 510 // tensor* local_a = transpose_k(b); // (B, M, D) -> (B, D, M) 511 // comment: this is equivalent to numpy's np.transpose(x, axes=(0, 2, 1)) 512 tensor* batched_transpose_k(tensor* x){ 513 assert_input(x, 3); 514 515 int shape_1 = x->shape[1]; 516 x->shape[1] = x->shape[2]; 517 x->shape[2] = shape_1; 518 519 int stride_1 = x->stride[1]; 520 x->stride[1] = x->stride[2]; 521 x->stride[2] = stride_1; 522 return x; 523 }