// optim.cpp
1 2 void adam(float learning_rate) { 3 param* temp = param_head; 4 5 while (temp){ 6 tensor* w = temp->value; 7 8 int t = temp->t; 9 tensor* beta1 = TensorLikeFill(w, temp->beta1); 10 tensor* beta2 = TensorLikeFill(w, temp->beta2); 11 tensor* epsilon = TensorLikeFill(w, temp->epsilon); 12 tensor* first_moment = temp->first_moment; 13 tensor* second_moment = temp->second_moment; 14 15 t++; 16 17 // first_moment = beta1 * first_moment + (1 - beta1) * dw 18 add_k_( 19 mul_k(beta1, first_moment), 20 mul_k(sub(TensorLikeFill(beta1, 1.), beta1), w->grad), 21 first_moment 22 ); 23 24 // second_moment = beta2 * second_moment + (1 - beta2) * dw**2 25 add_k_( 26 mul_k(beta2, second_moment), 27 mul_k(sub_k(TensorLikeFill(beta2, 1.), beta2), pow_k(w->grad, 2)), 28 second_moment 29 ); 30 31 // first_unbias = first_moment / (1 - beta1**t) 32 tensor* first_unbias = div_k( 33 first_moment, 34 sub_k( 35 TensorLikeFill(beta1, 1.), 36 pow_k(beta1, t) 37 ) 38 ); 39 40 // second_unbias = second_moment / (1 - beta2**t) 41 tensor* second_unbias = div_k( 42 second_moment, 43 sub_k( 44 TensorLikeFill(beta2, 1.), 45 pow_k(beta2, t) 46 ) 47 ); 48 49 // w -= lr * first_unbias / (np.sqrt(second_unbias) + epsilon) 50 sub_k_( 51 w, 52 div_k( 53 mul_k( 54 TensorLikeFill(first_unbias, learning_rate), 55 first_unbias 56 ), 57 add_k( 58 sqrt_k(second_unbias), 59 epsilon 60 ) 61 ), 62 w 63 ); 64 65 temp = temp->next; 66 } 67 } 68 69 70 // todo: mv to composite_ops 71 void sgd(float learning_rate, float momentum) { 72 // // param_head is a global variable 73 // extern param* param_head; 74 param* temp = param_head; 75 // iterate over the linked list of params 76 while (temp){ 77 // printf("[sgd] %s\n", temp->value->name); 78 tensor* w = temp->value; 79 80 if (momentum == 0.){ 81 // more convenient way (than the approach below) is to not 82 // even create out tensor (new weight) in the first place 83 sub_k_(w, mul_k(w->grad, TensorLikeFill(w->grad, learning_rate)), w); 84 85 // // assign to the 
"w->data" because I want the weight to be persistent -- 86 // // my first N tensors (weights, dataset) in the GC list are never removed from the GC; 87 // // but if I were to just replace the entire old weight tensor with the new weight tensor, 88 // // the new weight tensor would be cleaned up next epoch (bc it's in the end of the 89 // // GC list -- not one of the first N elements) 90 // cudaFree(w->data); 91 // tensor* new_w = sub_k(w, mul_k(w->grad, TensorLikeFill(w->grad, learning_rate))); 92 // GC_IDX--; // don't track "new_w" 93 // w->data = new_w->data; 94 } else { 95 /* 96 https://github.com/keras-team/keras/blob/e0108291a2c7a91271cb774bb130a4b8c576fb20/keras/src/optimizers/sgd.py#L19 97 velocity = momentum * velocity - learning_rate * g 98 w = w + velocity 99 */ 100 tensor* velocity = temp->velocity; 101 102 // note: using add_k_ here to avoid erasing in free_all_tensors 103 sub_k_( 104 mul_k(TensorLikeFill(velocity, momentum), velocity), 105 mul_k(TensorLikeFill(w->grad, learning_rate), w->grad), 106 velocity 107 ); 108 add_k_(w, velocity, w); 109 } 110 111 temp = temp->next; 112 } 113 } 114 115 void zero_grads(void) { 116 // // param_head is a global variable 117 // extern param* param_head; 118 param* temp = param_head; 119 while (temp){ 120 // printf("[zero_grads] %s\n", temp->value->name); 121 temp->value->grad = NULL; 122 temp = temp->next; 123 } 124 }