// optim.cpp — optimizer steps (adam, sgd) over the global param list.

  2  void adam(float learning_rate) {
  3      param* temp = param_head;
  4  
  5      while (temp){
  6          tensor* w = temp->value;
  7  
  8          int t = temp->t;
  9          tensor* beta1 = TensorLikeFill(w, temp->beta1);
 10          tensor* beta2 = TensorLikeFill(w, temp->beta2);
 11          tensor* epsilon = TensorLikeFill(w, temp->epsilon);
 12          tensor* first_moment = temp->first_moment;
 13          tensor* second_moment = temp->second_moment;
 14  
 15          t++;
 16  
 17          // first_moment = beta1 * first_moment + (1 - beta1) * dw
 18          add_k_(
 19              mul_k(beta1, first_moment),
 20              mul_k(sub(TensorLikeFill(beta1, 1.), beta1), w->grad),
 21              first_moment
 22          );
 23  
 24          // second_moment = beta2 * second_moment + (1 - beta2) * dw**2
 25          add_k_(
 26              mul_k(beta2, second_moment),
 27              mul_k(sub_k(TensorLikeFill(beta2, 1.), beta2), pow_k(w->grad, 2)),
 28              second_moment
 29          );
 30  
 31          // first_unbias = first_moment / (1 - beta1**t)
 32          tensor* first_unbias = div_k(
 33              first_moment,
 34              sub_k(
 35                  TensorLikeFill(beta1, 1.),
 36                  pow_k(beta1, t)
 37              )
 38          );
 39  
 40          // second_unbias = second_moment / (1 - beta2**t)
 41          tensor* second_unbias = div_k(
 42              second_moment,
 43              sub_k(
 44                  TensorLikeFill(beta2, 1.),
 45                  pow_k(beta2, t)
 46              )
 47          );
 48  
 49          // w -= lr * first_unbias / (np.sqrt(second_unbias) + epsilon)
 50          sub_k_(
 51              w,
 52              div_k(
 53                  mul_k(
 54                      TensorLikeFill(first_unbias, learning_rate),
 55                      first_unbias
 56                  ),
 57                  add_k(
 58                      sqrt_k(second_unbias),
 59                      epsilon
 60                  )
 61              ),
 62              w
 63          );
 64  
 65          temp = temp->next;
 66      }
 67  }
 68  
 69  
 70  // todo: mv to composite_ops
 71  void sgd(float learning_rate, float momentum) {
 72      // // param_head is a global variable
 73      // extern param* param_head;
 74      param* temp = param_head;
 75      // iterate over the linked list of params
 76      while (temp){
 77          // printf("[sgd] %s\n", temp->value->name);
 78          tensor* w = temp->value;
 79  
 80          if (momentum == 0.){
 81              // more convenient way (than the approach below) is to not
 82              // even create out tensor (new weight) in the first place
 83              sub_k_(w, mul_k(w->grad, TensorLikeFill(w->grad, learning_rate)), w);
 84  
 85              // // assign to the "w->data" because I want the weight to be persistent --
 86              // //  my first N tensors (weights, dataset) in the GC list are never removed from the GC;
 87              // //  but if I were to just replace the entire old weight tensor with the new weight tensor,
 88              // //  the new weight tensor would be cleaned up next epoch (bc it's in the end of the
 89              // //  GC list -- not one of the first N elements)
 90              // cudaFree(w->data);
 91              // tensor* new_w = sub_k(w, mul_k(w->grad, TensorLikeFill(w->grad, learning_rate)));
 92              // GC_IDX--; // don't track "new_w"
 93              // w->data = new_w->data;
 94          } else {
 95              /*
 96              https://github.com/keras-team/keras/blob/e0108291a2c7a91271cb774bb130a4b8c576fb20/keras/src/optimizers/sgd.py#L19
 97              velocity = momentum * velocity - learning_rate * g
 98              w = w + velocity
 99              */
100              tensor* velocity = temp->velocity;
101  
102              // note: using add_k_ here to avoid erasing in free_all_tensors
103              sub_k_(
104                  mul_k(TensorLikeFill(velocity, momentum), velocity),
105                  mul_k(TensorLikeFill(w->grad, learning_rate), w->grad),
106                  velocity
107              );
108              add_k_(w, velocity, w);
109          }
110  
111          temp = temp->next;
112      }
113  }
114  
115  void zero_grads(void) {
116      // // param_head is a global variable
117      // extern param* param_head;
118      param* temp = param_head;
119      while (temp){
120          // printf("[zero_grads] %s\n", temp->value->name);
121          temp->value->grad = NULL;
122          temp = temp->next;
123      }
124  }