// common/train.cpp
#include "train.h"
#include "common.h"

#include <new>
#include <random>
#include <sstream>
#include <functional>
   7  
// Seeded RNG plus a normal sampler; samples are clamped to [min, max]
// by frand_normal.
struct random_normal_distribution {
    std::mt19937 gen;                   // mersenne-twister engine
    std::normal_distribution<float> rd; // N(mean, std) sampler
    float min;                          // lower clamp bound
    float max;                          // upper clamp bound
};
  14  
// Seeded RNG plus a uniform sampler over a fixed [min, max) range.
struct random_uniform_distribution {
    std::mt19937 gen;                        // mersenne-twister engine
    std::uniform_real_distribution<float> rd; // uniform sampler over [min, max)
};
  19  
  20  struct train_state  * init_train_state() {
  21      struct train_state * state = new struct train_state;
  22      state->train_its     = 0;
  23      state->train_samples = 0;
  24      state->train_tokens  = 0;
  25      state->train_epochs  = 0;
  26      state->shuffle_samples_hash  = 0;
  27      state->shuffle_sample_count  = 0;
  28      state->shuffle_next_sample   = 0;
  29      state->shuffle_rng_state_current = "";
  30      state->shuffle_rng_state_next    = "";
  31  
  32      state->opt = new struct ggml_opt_context;
  33      state->opt->ctx = NULL;
  34      state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
  35      state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
  36      state->opt->loss_after = 0.0f;
  37  
  38      return state;
  39  }
  40  
  41  void free_train_state(struct train_state  * state) {
  42      delete state->opt;
  43      delete state;
  44  }
  45  
  46  struct random_normal_distribution * init_random_normal_distribution(
  47      int seed, float mean, float std, float min, float max
  48  ) {
  49      struct random_normal_distribution * rnd = (struct random_normal_distribution *) malloc(sizeof(struct random_normal_distribution));
  50      rnd->gen = std::mt19937(seed);
  51      rnd->rd = std::normal_distribution<float>{mean, std};
  52      rnd->min = min;
  53      rnd->max = max;
  54      return rnd;
  55  }
  56  
  57  struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
  58      struct random_uniform_distribution * rnd = (struct random_uniform_distribution *) malloc(sizeof(struct random_uniform_distribution));
  59      rnd->gen = std::mt19937(seed);
  60      rnd->rd = std::uniform_real_distribution<float>{min, max};
  61      return rnd;
  62  }
  63  
// Release storage from init_random_normal_distribution (malloc-allocated).
// No destructor call: the members (mt19937, normal_distribution) are
// trivially destructible in practice — NOTE(review): relies on that.
void free_random_normal_distribution (struct random_normal_distribution  * rnd) {
    free(rnd);
}
  67  
// Release storage from init_random_uniform_distribution (malloc-allocated).
// No destructor call: members are trivially destructible in practice.
void free_random_uniform_distribution(struct random_uniform_distribution * rnd) {
    free(rnd);
}
  71  
  72  struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
  73      float scale = 1.0f; // xavier
  74      switch (ggml_n_dims(tensor)) {
  75          case 1:
  76              scale /= sqrtf((float) tensor->ne[0]);
  77              for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
  78                  float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
  79                  *dst = scale * frand_normal(rnd);
  80              }
  81              break;
  82          case 2:
  83              scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
  84              for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
  85                  for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
  86                      float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
  87                      *dst = scale * frand_normal(rnd);
  88                  }
  89              }
  90              break;
  91          case 3:
  92              scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
  93              for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
  94                  for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
  95                      for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
  96                          float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
  97                          *dst = scale * frand_normal(rnd);
  98                      }
  99                  }
 100              }
 101              break;
 102          case 4:
 103              scale /= sqrtf((float) tensor->ne[0]+tensor->ne[1]);
 104              for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
 105                  for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
 106                      for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
 107                          for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 108                              float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
 109                              *dst = scale * frand_normal(rnd);
 110                          }
 111                      }
 112                  }
 113              }
 114              break;
 115          default:
 116              die("Unsupported tensor->n_dims");
 117      };
 118      return tensor;
 119  }
 120  
 121  struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
 122      switch (ggml_n_dims(tensor)) {
 123          case 1:
 124              for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 125                  float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
 126                  *dst = frand_uniform(rnd);
 127              }
 128              break;
 129          case 2:
 130              for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
 131                  for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 132                      float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
 133                      *dst = frand_uniform(rnd);
 134                  }
 135              }
 136              break;
 137          case 3:
 138              for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
 139                  for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
 140                      for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 141                          float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
 142                          *dst = frand_uniform(rnd);
 143                      }
 144                  }
 145              }
 146              break;
 147          case 4:
 148              for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
 149                  for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
 150                      for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
 151                          for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
 152                              float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2] + i3*tensor->nb[3]);
 153                              *dst = frand_uniform(rnd);
 154                          }
 155                      }
 156                  }
 157              }
 158              break;
 159          default:
 160              die("Unsupported tensor->n_dims");
 161      };
 162      return tensor;
 163  }
 164  
 165  float frand() {
 166      return (float)rand()/((float)(RAND_MAX) + 1.0f);
 167  }
 168  
 169  float frand_normal(struct random_normal_distribution * rnd) {
 170      return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
 171  }
 172  
 173  float frand_uniform(struct random_uniform_distribution * rnd) {
 174      return rnd->rd(rnd->gen);
 175  }
 176  
 177  int clamp(const int v, const int min, const int max) {
 178      return ((v < min) ? (min) : (v > max) ? (max) : v);
 179  }
 180  
 181  float fclamp(const float v, const float min, const float max) {
 182      return ((v < min) ? (min) : (v > max) ? (max) : v);
 183  }
 184  
// Assert that tensor is effectively 1-D: ne0 elements, all higher dims == 1.
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == 1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
}
 191  
// Assert that tensor is effectively 2-D with shape [ne0, ne1].
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
}
 198  
// Assert that tensor is effectively 3-D with shape [ne0, ne1, ne2].
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == 1);
}
 205  
// Assert that tensor has the exact 4-D shape [ne0, ne1, ne2, ne3].
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == ne3);
}
 212  
// Build one training batch: writes input token ids into tokens_input
// ([n_tokens, n_batch]) and one-hot targets into target_probs
// ([n_vocab, n_tokens, n_batch]), pulling tokens from train_data at the
// (possibly shuffled) sample positions starting at example_id.
// When fill_with_next_samples is set, exhausted samples are chained with
// optional eos/bos separators; otherwise remaining positions pad with eos.
// Returns the number of samples consumed so the caller can advance
// example_id for the next batch.
int64_t get_example_targets_batch(
    struct llama_context * lctx,
    struct ggml_tensor   * tokens_input,
    struct ggml_tensor   * target_probs,
    int64_t                example_id,
    const size_t         * samples_offs,
    const size_t         * samples_begin,
    const size_t         * samples_size,
          size_t           samples_count,
    const llama_token    * train_data,
    size_t                 n_train_data,
    bool                   separate_with_eos,
    bool                   separate_with_bos,
    bool                   fill_with_next_samples,
    bool                   sample_random_offsets
) {
    GGML_ASSERT(samples_count > 0);
    GGML_ASSERT(ggml_is_matrix(tokens_input));
    GGML_ASSERT(ggml_is_3d(target_probs));
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_vocab  == target_probs->ne[0]);
    GGML_ASSERT(n_tokens == target_probs->ne[1]);
    GGML_ASSERT(n_batch  == target_probs->ne[2]);

    int64_t used_samples = 0;

    // clear all targets; exactly one entry per (token, batch) position is
    // raised to 1.0 below
    ggml_set_f32(target_probs, 0.0f);
    llama_token bos = llama_token_bos(llama_get_model(lctx));
    llama_token eos = llama_token_eos(llama_get_model(lctx));
    // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
    for (int k=0; k<n_batch; ++k) {
        // printf("%s: batch %d\n", __func__, k);
        size_t sample_idx   = (example_id + used_samples) % samples_count;
        size_t sample_offs  = sample_random_offsets ? samples_offs[sample_idx] : 0;
        size_t sample_begin = samples_begin[sample_idx];
        size_t sample_size  = samples_size[sample_idx];
        ++used_samples;

        // printf("%s: sample_idx=%zu sample=%zu\n", __func__, sample_idx, sample);
        GGML_ASSERT(sample_begin+sample_size-1 < n_train_data);

        // each sequence starts with bos
        ggml_set_i32_nd(tokens_input, 0, k, 0, 0, bos);
        // when a separator token is disabled, treat it as already emitted
        bool sample_separation_eos = !separate_with_eos;
        bool sample_separation_bos = !separate_with_bos;
        for (int64_t i=0; i<n_tokens; ++i) {
            llama_token token = eos; // default: pad with eos once data runs out
            if (sample_offs >= sample_size && fill_with_next_samples) {
                // current sample exhausted: emit separators, then chain the
                // next sample (one token of this loop per step)
                if (!sample_separation_eos) {
                    // insert eos token to separate samples
                    sample_separation_eos = true;
                } else if (!sample_separation_bos) {
                    // insert bos token to separate samples
                    sample_separation_bos = true;
                    token = bos;
                } else {
                    // sample separation is done, continue with next sample
                    sample_separation_eos = !separate_with_eos;
                    sample_separation_bos = !separate_with_bos;
                    sample_offs  = 0;
                    sample_idx   = (example_id + used_samples) % samples_count;
                    sample_begin = samples_begin[sample_idx];
                    sample_size  = samples_size[sample_idx];
                    ++used_samples;
                }
            }
            // note: no else-if here
            if (sample_offs < sample_size) {
                // guard against out-of-vocab ids in the training data
                token = clamp(train_data[sample_begin+sample_offs], 0, (llama_token) (n_vocab - 1));
                ++sample_offs;
            }
            // one-hot target at position i; the same token becomes the
            // model input at position i+1 (inputs are targets shifted by one)
            ggml_set_f32_nd(target_probs,  token, (int) i, (int) k, 0, +1.0f);
            if (i+1<n_tokens) {
                ggml_set_i32_nd(tokens_input, (int) (i + 1), (int) k, 0, 0, token);
            }
        }
    }

    return used_samples;
}
 294  
 295  void mt19937_set_state(std::mt19937& rng, const std::string& rng_state) {
 296      std::stringstream s_rng_state;
 297      s_rng_state.imbue(std::locale::classic());
 298      s_rng_state.exceptions(std::stringstream::failbit);
 299      s_rng_state.str(rng_state);
 300      s_rng_state >> rng;
 301  }
 302  
 303  std::string mt19937_get_state(const std::mt19937& rng) {
 304      std::stringstream s_rng_state;
 305      s_rng_state.imbue(std::locale::classic());
 306      s_rng_state << rng;
 307      return s_rng_state.str();
 308  }
 309  
 310  std::string mt19937_seed_to_state(unsigned seed) {
 311      std::mt19937 rng(seed);
 312      return mt19937_get_state(rng);
 313  }
 314  
 315  std::string shuffle_samples(
 316          const std::string & rng_state,
 317          size_t            * shuffled_offs,
 318          size_t            * shuffled_begins,
 319          size_t            * shuffled_sizes,
 320          const size_t      * begins,
 321          const size_t      * sizes,
 322          size_t              count) {
 323      if (count == 0) return rng_state;
 324  
 325      std::mt19937 rng;
 326      mt19937_set_state(rng, rng_state);
 327  
 328      // sort indices by random value for each index
 329      std::vector<size_t> idcs;
 330      {
 331          std::vector<unsigned> rnd;
 332          idcs.resize(count);
 333          rnd.resize(count);
 334          for (unsigned i=0; i<count; ++i) {
 335              idcs[i] = i;
 336              rnd[i]  = rng();
 337          }
 338  
 339          std::sort(idcs.begin(), idcs.end(), [&rnd](size_t a, size_t b){
 340              // stable sort for reproducibility
 341              return (rnd[a] == rnd[b]) ? (a < b) : (rnd[a] < rnd[b]);
 342          });
 343      }
 344  
 345      // create random offsets
 346      for (unsigned i=0; i<count; ++i) {
 347          shuffled_offs[i] = (size_t) ((sizes[idcs[i]] - 1) * ((double) rng() / (double) (rng.max()-1)));
 348      }
 349  
 350      // reorder begins and sizes by sorted indices
 351      for (unsigned i=0; i<count; ++i) {
 352          shuffled_begins[i] = begins[idcs[i]];
 353      }
 354  
 355      for (unsigned i=0; i<count; ++i) {
 356          shuffled_sizes[i] = sizes[idcs[i]];
 357      }
 358  
 359      return mt19937_get_state(rng);
 360  }
 361  
 362  size_t hash_combine(size_t h1, size_t h2) {
 363      return h1 ^ (h2 << 1);
 364  }
 365  
 366  size_t compute_samples_hash(const char* fn, const size_t* samples_begin, const size_t* samples_size, size_t sample_count) {
 367      std::hash<std::string> h_string;
 368      std::hash<unsigned long long> h_ull;
 369      size_t h = h_string(std::string(fn));
 370      h = hash_combine(h, h_ull((unsigned long long) sample_count));
 371      for (size_t i=0; i< sample_count; ++i) {
 372          h = hash_combine(h, h_ull((unsigned long long) samples_begin[i]));
 373          h = hash_combine(h, h_ull((unsigned long long) samples_size[i]));
 374      }
 375      return h;
 376  }
 377  
 378  std::string replace_str(const char * s, const char * needle, const char * replacement) {
 379      std::string str = s;
 380      size_t pos = str.find(needle);
 381      if (pos != std::string::npos) {
 382          str.replace(pos, strlen(needle), replacement);
 383      }
 384      return str;
 385  }
 386  
 387  void print_duration(double fmillis) {
 388      if (fmillis < 1000.0f) {
 389          printf("%.1fms", (float) fmillis);
 390          return;
 391      }
 392      const int64_t one_sec  = 1000;
 393      const int64_t one_min  = one_sec  * 60;
 394      const int64_t one_hour = one_min  * 60;
 395      const int64_t one_day  = one_hour * 24;
 396  
 397      int64_t millis  = (int64_t) fmillis;
 398      int64_t days    = millis/one_day;
 399      int64_t hours   = (millis - days*one_day)/one_hour;
 400      int64_t minutes = (millis - days*one_day - hours*one_hour)/one_min;
 401      int64_t seconds = (millis - days*one_day - hours*one_hour - minutes*one_min)/one_sec;
 402  
 403      // to print int64_t either cast to (long long int) or use macro PRId64 from <inttypes.h>
 404      if (days > 0) {
 405          printf("%lldd ", (long long int) days);
 406      }
 407      printf("%02lld:%02lld:%02lld", (long long int) hours, (long long int) minutes, (long long int) seconds);
 408  }
 409  
 410  float cosine_decay(int64_t step, int64_t decay_steps, float minimum) {
 411      if (step > decay_steps) {
 412          step = decay_steps;
 413      }
 414      const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps));
 415      const float decay = (1 - minimum)*cosine_decay + minimum;
 416      return decay;
 417  }
 418  
 419  float cosine_decay_restart(int64_t step, int64_t decay_steps, float minimum, float restart_step_mult) {
 420      while (step > decay_steps) {
 421          step -= decay_steps;
 422          decay_steps = (int64_t) (restart_step_mult * decay_steps);
 423      }
 424      return cosine_decay(step, decay_steps, minimum);
 425  }
 426  
 427  float learning_schedule(
 428      int64_t step,
 429      int64_t warmup_steps,
 430      int64_t cos_decay_steps,
 431      float   learning_rate,
 432      float   overall_minimum,
 433      float   cos_decay_minimum,
 434      float   cos_decay_restart_step_mult,
 435      bool    enable_restart) {
 436  
 437      float result =
 438          (step < warmup_steps)
 439              ? (float) step / (float) warmup_steps
 440              : enable_restart
 441                  ? cosine_decay_restart(
 442                      step - warmup_steps,
 443                      cos_decay_steps,
 444                      cos_decay_minimum,
 445                      cos_decay_restart_step_mult)
 446                  : cosine_decay(
 447                      step,
 448                      cos_decay_steps,
 449                      cos_decay_minimum);
 450  
 451      float min = overall_minimum / learning_rate;
 452      result = min + result * (1.0f - min);
 453      return result;
 454  }
 455  
// Verify (via hard asserts) that two tensors can be memcpy'd into each other:
// non-null, same type, same shape, both contiguous. Always returns true —
// a layout mismatch aborts instead of returning false.
static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
    GGML_ASSERT(a != NULL);
    GGML_ASSERT(b != NULL);
    GGML_ASSERT(a->type == b->type);
    GGML_ASSERT(ggml_are_same_shape(a, b));
    GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b));

    return true;
}
 465  
// Copy data from the tensor named `name` in `ctx` into `dst`.
// No-op when dst is NULL. Aborts (via are_same_layout's asserts) when the
// named tensor is missing or its layout does not match dst.
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
    if (dst == NULL) {
        return;
    }
    struct ggml_tensor * t  = ggml_get_tensor(ctx, name);
    GGML_ASSERT(are_same_layout(dst, t));
    memcpy(dst->data, t->data, ggml_nbytes(t));

    // adopt the source name if dst is still unnamed
    if (strlen(ggml_get_name(dst)) == 0) {
        ggml_set_name(dst, name);
    }
}
 478  
// gguf constants: keys and tensor names used to (de)serialize optimizer and
// training state in load/save_opt_context_gguf and the train_state loaders.

// optimizer metadata keys
static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
static const char * LLM_KV_OPTIMIZER_TYPE_ADAM  = "adam";
static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
static const char * LLM_KV_OPTIMIZER_FILE_VERSION               = "optimizer.file_version";
static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT     = "optimizer.convergence_past_count";
static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT            = "optimizer.parameter_count";
static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT            = "optimizer.iteration_count";
static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED           = "optimizer.just_initialized";
static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS             = "optimizer.adam.best_loss";
static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS         = "optimizer.adam.previous_loss";
static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT  = "optimizer.adam.no_improvement_count";
static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS            = "optimizer.lbfgs.best_loss";
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP     = "optimizer.lbfgs.line_search_step";
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J        = "optimizer.lbfgs.line_search_j";
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K        = "optimizer.lbfgs.line_search_k";
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END      = "optimizer.lbfgs.line_search_end";
static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";

// optimizer state tensor names (ADAM)
static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS    = "optimizer.adam.first_moments";
static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS   = "optimizer.adam.second_moments";
static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";

// optimizer state tensor names (L-BFGS)
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS  = "optimizer.lbfgs.current_parameters";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS   = "optimizer.lbfgs.current_gradients";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS  = "optimizer.lbfgs.previous_gradients";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION    = "optimizer.lbfgs.search_direction";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES    = "optimizer.lbfgs.past_loss_values";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA        = "optimizer.lbfgs.memory_alpha";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS           = "optimizer.lbfgs.memory_ys";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S            = "optimizer.lbfgs.memory_s";
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y            = "optimizer.lbfgs.memory_y";

// training progress / shuffle-state keys
static const char * LLM_KV_TRAINING_FILE_VERSION         = "training.file_version";
static const char * LLM_KV_TRAINING_ITERATION_COUNT      = "training.iteration_count";
static const char * LLM_KV_TRAINING_SAMPLE_COUNT         = "training.sample_count";
static const char * LLM_KV_TRAINING_TOKEN_COUNT          = "training.token_count";
static const char * LLM_KV_TRAINING_EPOCH_COUNT          = "training.epoch_count";
static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH = "training.shuffle.samples_hash";
static const char * LLM_KV_TRAINING_SHUFFLE_RNG_STATE    = "training.shuffle.rng_state";
static const char * LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT = "training.shuffle.sample_count";
static const char * LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE  = "training.shuffle.next_sample";
 523  
// Read key `key` from gguf context `ctx` into `dst` using accessor `func`
// (e.g. gguf_get_val_u32). Aborts via die_fmt when the key exists with a
// gguf type other than `type`, or when `req` (required) is true and the key
// is missing. A missing optional key leaves `dst` unchanged.
#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
{ \
    const std::string skey(key); \
    const int kid = gguf_find_key(ctx, skey.c_str()); \
    if (kid >= 0) { \
        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
        if (ktype != (type)) { \
            die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \
        } \
        (dst) = func(ctx, kid); \
    } else if (req) { \
        die_fmt("key not found in model: %s", skey.c_str()); \
    } \
}
 538  
// Restore an optimizer context (ADAM or L-BFGS) from a gguf file: reads the
// scalar hyper-parameters/counters from kv pairs, then initializes the
// optimizer and copies its state tensors out of f_ggml_ctx by name.
// Dies on missing/odd-typed keys or an unknown optimizer type.
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read

    uint32_t file_version;
    GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION);
    GGML_ASSERT(file_version == 0); // only version 0 is supported

    GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT);
    GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ITERATION_COUNT);
    GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED);

    uint64_t nx;
    GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT);
    opt->nx = (size_t) nx;

    // don't call ggml_opt_init until optimizer type and optimizer specific parameters are known

    std::string opt_type;
    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
        opt->params.type = GGML_OPT_TYPE_ADAM;

        // ADAM scalar state
        GGUF_GET_KEY(fctx, opt->adam.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
        GGUF_GET_KEY(fctx, opt->adam.fx_prev,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
        GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT);

        // allocate the optimizer's own tensors, then fill them from the file
        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);

        copy_tensor_by_name(opt->adam.m,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
        copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
        copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
        opt->params.type = GGML_OPT_TYPE_LBFGS;

        // L-BFGS scalar state
        GGUF_GET_KEY(fctx, opt->params.lbfgs.m,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
        GGUF_GET_KEY(fctx, opt->lbfgs.step,             gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP);
        GGUF_GET_KEY(fctx, opt->lbfgs.j,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J);
        GGUF_GET_KEY(fctx, opt->lbfgs.k,                gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K);
        GGUF_GET_KEY(fctx, opt->lbfgs.end,              gguf_get_val_i32, GGUF_TYPE_INT32,   true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END);
        GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT);

        // allocate the optimizer's own tensors, then fill them from the file
        ggml_opt_init(opt->ctx, opt, opt->params, opt->nx);

        copy_tensor_by_name(opt->lbfgs.x,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
        copy_tensor_by_name(opt->lbfgs.xp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
        copy_tensor_by_name(opt->lbfgs.g,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
        copy_tensor_by_name(opt->lbfgs.gp,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
        copy_tensor_by_name(opt->lbfgs.d,    f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
        copy_tensor_by_name(opt->lbfgs.pf,   f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
        copy_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
        copy_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
        copy_tensor_by_name(opt->lbfgs.lms,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
        copy_tensor_by_name(opt->lbfgs.lmy,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
    } else {
        die("unknown optimizer type\n");
    }
}
 597  
// Serialize an optimizer context into a gguf file: writes the common scalar
// kv pairs, then the optimizer-type-specific scalars and state tensors.
// Counterpart of load_opt_context_gguf (writes file_version 0).
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
    gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
    gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter);
    gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);

    switch (opt->params.type) {
        case GGML_OPT_TYPE_ADAM:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS,            opt->adam.fx_best);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS,        opt->adam.fx_prev);
                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement);

                // tensors must be named before gguf_add_tensor so the loader
                // can find them again with copy_tensor_by_name
                ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS);
                ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
                // pf (past loss values) only exists when params.past > 0
                if (opt->adam.pf) {
                    ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
                }

                gguf_add_tensor(fctx, opt->adam.m);
                gguf_add_tensor(fctx, opt->adam.v);
                if (opt->adam.pf) {
                    gguf_add_tensor(fctx, opt->adam.pf);
                }
            } break;
        case GGML_OPT_TYPE_LBFGS:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS,            opt->lbfgs.fx_best);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP,     opt->lbfgs.step);
                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J,        opt->lbfgs.j);
                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K,        opt->lbfgs.k);
                gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END,      opt->lbfgs.end);
                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);

                // tensors must be named before gguf_add_tensor so the loader
                // can find them again with copy_tensor_by_name
                ggml_set_name(opt->lbfgs.x,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
                ggml_set_name(opt->lbfgs.xp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
                ggml_set_name(opt->lbfgs.g,    LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
                ggml_set_name(opt->lbfgs.gp,   LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
                ggml_set_name(opt->lbfgs.d,    LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
                // pf (past loss values) only exists when params.past > 0
                if (opt->lbfgs.pf) {
                    ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
                }
                ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
                ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
                ggml_set_name(opt->lbfgs.lms,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
                ggml_set_name(opt->lbfgs.lmy,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);

                gguf_add_tensor(fctx, opt->lbfgs.x);
                gguf_add_tensor(fctx, opt->lbfgs.xp);
                gguf_add_tensor(fctx, opt->lbfgs.g);
                gguf_add_tensor(fctx, opt->lbfgs.gp);
                gguf_add_tensor(fctx, opt->lbfgs.d);
                if (opt->lbfgs.pf) {
                    gguf_add_tensor(fctx, opt->lbfgs.pf);
                }
                gguf_add_tensor(fctx, opt->lbfgs.lmal);
                gguf_add_tensor(fctx, opt->lbfgs.lmys);
                gguf_add_tensor(fctx, opt->lbfgs.lms);
                gguf_add_tensor(fctx, opt->lbfgs.lmy);
            } break;
    }
}
 664  
// Load training state (progress counters, shuffle bookkeeping, optimizer state)
// from a GGUF file. Returns false when the file carries no training state
// (the file-version key is absent); otherwise loads into `train` and returns true.
// Supports file versions 0 (32-bit counters only) and 1 (64-bit counters plus
// epoch count and optional shuffle state).
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train) {
    // no file-version key -> this GGUF does not contain a training state
    if (gguf_find_key(fctx, LLM_KV_TRAINING_FILE_VERSION) < 0) {
        return false;
    }

    uint32_t file_version;
    GGUF_GET_KEY(fctx, file_version,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION);
    GGML_ASSERT(file_version <= 1);

    if (file_version == 0) {

        // version 0 stored the counters as 32-bit values and had no epoch/shuffle state
        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT);
        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT);
        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT);

    } else if (file_version == 1) {

        // version 1 widened the counters to 64 bits and added the epoch count
        GGUF_GET_KEY(fctx, train->train_its,     gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_ITERATION_COUNT);
        GGUF_GET_KEY(fctx, train->train_samples, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_SAMPLE_COUNT);
        GGUF_GET_KEY(fctx, train->train_tokens,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_TOKEN_COUNT);
        GGUF_GET_KEY(fctx, train->train_epochs,  gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_TRAINING_EPOCH_COUNT);

        // shuffle state is optional (required=false): v1 files may omit these keys,
        // in which case the fields passed in keep their previous values
        GGUF_GET_KEY(fctx, train->shuffle_samples_hash,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH);
        GGUF_GET_KEY(fctx, train->shuffle_rng_state_current, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_SHUFFLE_RNG_STATE);
        GGUF_GET_KEY(fctx, train->shuffle_sample_count,      gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT);
        GGUF_GET_KEY(fctx, train->shuffle_next_sample,       gguf_get_val_u64, GGUF_TYPE_UINT64, false, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE);
    }

    // optimizer tensors/scalars are read by the opt-specific loader
    load_opt_context_gguf(fctx, f_ggml_ctx, train->opt);
    return true;
}
 696  
 697  void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train) {
 698      gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION,    1);
 699      gguf_set_val_u64(fctx, LLM_KV_TRAINING_ITERATION_COUNT, train->train_its);
 700      gguf_set_val_u64(fctx, LLM_KV_TRAINING_SAMPLE_COUNT,    train->train_samples);
 701      gguf_set_val_u64(fctx, LLM_KV_TRAINING_TOKEN_COUNT,     train->train_tokens);
 702      gguf_set_val_u64(fctx, LLM_KV_TRAINING_EPOCH_COUNT,     train->train_epochs);
 703  
 704      gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLES_HASH, (uint64_t) train->shuffle_samples_hash);
 705      gguf_set_val_str(fctx, LLM_KV_TRAINING_SHUFFLE_RNG_STATE,    train->shuffle_rng_state_current.c_str());
 706      gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_SAMPLE_COUNT, (uint64_t) train->shuffle_sample_count);
 707      gguf_set_val_u64(fctx, LLM_KV_TRAINING_SHUFFLE_NEXT_SAMPLE,  (uint64_t) train->shuffle_next_sample);
 708  
 709      save_opt_context_gguf(fctx, train->opt);
 710  }
 711  
 712  
// Minimal RAII wrapper around a C FILE*: opens on construction, closes on
// destruction, and provides die()-on-error read/write helpers.
// 64-bit file offsets are used on Windows via _ftelli64/_fseeki64.
struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;  // total file size in bytes; 0 when the file could not be opened

    // Open `fname` with the given fopen mode and probe the file size.
    // On open failure fp is NULL and size is 0 -- callers must check
    // (e.g. via size == 0) before calling the read/write helpers.
    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            // size = distance from start to end of file
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    // Current file offset in bytes; asserts instead of returning an error.
    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    // fseek wrapper; `whence` is SEEK_SET/SEEK_CUR/SEEK_END. Asserts on failure.
    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    // Read exactly `size` bytes into `ptr`; dies on read error or premature EOF.
    // NOTE(review): the parameter `size` shadows the member `size`.
    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        // one element of `size` bytes -> ret is 1 on full success, 0 otherwise
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("read error: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    // Read a little host-endian 32-bit value (no byte swapping is performed).
    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    // Read `len` raw bytes and return them as a string (may contain NULs).
    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    // Write exactly `size` bytes from `ptr`; dies on a short write.
    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            die_fmt("write error: %s", strerror(errno));
        }
    }

    // Write a host-endian 32-bit value.
    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    // Close the file if it was successfully opened.
    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};
 795  
 796  static size_t utf8_len(char src) {
 797      const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
 798      uint8_t highbits = static_cast<uint8_t>(src) >> 4;
 799      return lookup[highbits];
 800  }
 801  
 802  // mark each byte with its utf8 unit number.
 803  // returns the number of utf8 characters.
 804  // e.g. when bytes == '\x61\xD0\xB0\x62',
 805  // then utf8_units will become [0,0,1,0]
 806  // utf8_nunits will become [1,2,2,1] and 3 is returned.
 807  // bytes where utf8_units is zero, are the begin of an utf8 character.
 808  static size_t mark_utf8_units(const char* bytes, int * utf8_units, int * utf8_nunits, size_t count) {
 809      size_t offs = 0;
 810      size_t count_utf8 = 0;
 811      while(offs < count) {
 812          int len = (int) utf8_len(bytes[offs]);
 813          for (int i=0; i<len; ++i) {
 814              utf8_units[offs+i]  = i;
 815              utf8_nunits[offs+i] = len;
 816          }
 817          offs += len;
 818          ++count_utf8;
 819      }
 820      return count_utf8;
 821  }
 822  
 823  size_t tokenize_file(
 824          struct llama_context     * lctx,
 825          const char               * filename,
 826          const std::string        & sample_start,
 827          bool                       include_sample_start,
 828          bool                       overlapping_samples,
 829          unsigned                   context_length,
 830          std::vector<llama_token> & out_tokens,
 831          std::vector<size_t>      & out_samples_begin,
 832          std::vector<size_t>      & out_samples_size) {
 833      struct llama_file f(filename, "rb");
 834  
 835      if (f.size == 0) {
 836          out_tokens.clear();
 837          out_samples_begin.clear();
 838          out_samples_size.clear();
 839          printf("%s: warning: empty or not existing training data file '%s'\n",
 840              __func__, filename);
 841          return out_tokens.size();
 842      }
 843  
 844      // account for possible leading whitespace that will be added by tokenizer
 845      // e.g. '\t' will be tokenized by llama spm tokenizer to [29871, 12]
 846      const int n_max_tokens_overhead = 1;
 847  
 848      std::vector<char> buf;
 849      buf.resize(f.size);
 850  
 851      f.read_raw(buf.data(), f.size);
 852  
 853      std::vector<int> utf8_units;
 854      std::vector<int> utf8_nunits;
 855      utf8_units.resize(buf.size());
 856      utf8_nunits.resize(buf.size());
 857      mark_utf8_units(buf.data(), utf8_units.data(), utf8_nunits.data(), buf.size());
 858  
 859      if (sample_start.size() == 0) {
 860          // tokenize all data at once
 861          out_tokens.resize(buf.size() + n_max_tokens_overhead);
 862  
 863          int n_tokens = llama_tokenize(
 864              llama_get_model(lctx),
 865              buf.data(),
 866              (int) buf.size(),
 867              out_tokens.data(),
 868              (int) out_tokens.size(),
 869              false, false);
 870          if (n_tokens < 0) {
 871              out_tokens.resize(-n_tokens);
 872              n_tokens = llama_tokenize(
 873                  llama_get_model(lctx),
 874                  buf.data(),
 875                  (int) buf.size(),
 876                  out_tokens.data(),
 877                  (int) out_tokens.size(),
 878                  false, false);
 879          }
 880          if (n_tokens >= 0) {
 881              out_tokens.resize(n_tokens);
 882          }
 883  
 884          // generate sample starts at all token positions
 885          out_samples_begin.clear();
 886          out_samples_begin.push_back(0);
 887          out_samples_size.push_back(std::min((size_t) context_length, out_tokens.size()));
 888          size_t end = (out_tokens.size() >= context_length) ? (out_tokens.size() - context_length) : 0;
 889          for (size_t sample_begin = 1; sample_begin < end; ++sample_begin) {
 890              out_samples_begin.push_back(sample_begin);
 891              out_samples_size.push_back(context_length);
 892          }
 893      } else {
 894          // split data into samples and tokenize each sample
 895          std::string data_str(buf.data(), buf.size());
 896          out_samples_begin.clear();
 897          out_samples_size.clear();
 898          out_tokens.clear();
 899  
 900          // find all positions of pattern sample_start
 901          size_t sample_begin = data_str.find(sample_start, 0);
 902          while (sample_begin != std::string::npos) {
 903              out_samples_begin.push_back(sample_begin);
 904              const size_t search_start = sample_begin + sample_start.size();
 905              sample_begin = data_str.find(sample_start, search_start);
 906          }
 907          if (out_samples_begin.size() == 0) {
 908              printf("%s: warning: sample start pattern '%s' not found. inserting single sample at data begin\n",
 909                  __func__, sample_start.c_str());
 910              out_samples_begin.push_back(0);
 911          }
 912  
 913          out_samples_size.resize(out_samples_begin.size(), 0);
 914  
 915          std::vector<char>        buf_sample;
 916          std::vector<llama_token> tok_sample;
 917  
 918          const size_t sample_begin_offset = (include_sample_start ? 0 : sample_start.size());
 919          size_t found_too_big_sample   = 0;
 920          size_t found_too_small_sample = 0;
 921          size_t found_empty_sample     = 0;
 922          size_t found_min_sample_size  = SIZE_MAX;
 923          size_t found_max_sample_size  = 0;
 924  
 925          size_t max_token_text_size = 0;
 926          int n_vocab = llama_n_vocab(llama_get_model(lctx));
 927          for (llama_token token=0; token < n_vocab; ++token) {
 928              max_token_text_size = std::max(
 929                  max_token_text_size,
 930                  strlen(llama_token_get_text(llama_get_model(lctx), token)));
 931          }
 932  
 933          // upper bound of context byte length.
 934          // strings with this byte length should always tokenize to at least context_length tokens.
 935          size_t context_byte_len = max_token_text_size*context_length;
 936  
 937          for (unsigned i=0; i<out_samples_begin.size(); ++i) {
 938              // determine sample begin and end from pattern positions
 939              size_t sample_begin = out_samples_begin[i] + sample_begin_offset;
 940              size_t sample_end   = overlapping_samples
 941                                      ? std::min(
 942                                          data_str.size(),
 943                                          sample_begin + context_byte_len)
 944                                      : (i+1 < out_samples_begin.size()
 945                                          ? out_samples_begin[i+1]
 946                                          : data_str.size());
 947              if (sample_end < utf8_units.size() && utf8_units[sample_end] > 0) {
 948                  // sample end is in the middle of an utf8 character.
 949                  // advance sample_end to the begin of the next utf8 character.
 950                  sample_end += utf8_nunits[sample_end] - utf8_units[sample_end];
 951              }
 952              size_t sample_size = sample_end - sample_begin;
 953              if (sample_size == 0) {
 954                  ++found_empty_sample;
 955              }
 956  
 957              if (sample_size > 0) {
 958                  // llama_tokenize expects zero terminated string,
 959                  // copy sample into buffer and zero terminate it.
 960                  buf_sample.resize(sample_size);
 961                  memcpy(buf_sample.data(), data_str.data() + sample_begin, sample_size);
 962  
 963                  // printf("sample: '%s'\n", buf_sample.data());
 964  
 965                  // tokenize the sample
 966                  tok_sample.resize(buf_sample.size() + n_max_tokens_overhead);
 967                  int n_tokens = llama_tokenize(llama_get_model(lctx),
 968                      buf_sample.data(),
 969                      (int) buf_sample.size(),
 970                      tok_sample.data(),
 971                      (int) tok_sample.size(),
 972                      false, false);
 973                  if (n_tokens < 0) {
 974                      tok_sample.resize(-n_tokens);
 975                      n_tokens = llama_tokenize(llama_get_model(lctx),
 976                          buf_sample.data(),
 977                          (int) buf_sample.size(),
 978                          tok_sample.data(),
 979                          (int) tok_sample.size(),
 980                          false, false);
 981                      GGML_ASSERT(n_tokens >= 0);
 982                  }
 983                  GGML_ASSERT(n_tokens <= (int) tok_sample.size());
 984  
 985                  if ((size_t) n_tokens > context_length) {
 986                      ++found_too_big_sample;
 987                  } else if ((size_t) n_tokens < context_length) {
 988                      ++found_too_small_sample;
 989                  }
 990                  found_max_sample_size = std::max(found_max_sample_size, (size_t) n_tokens);
 991                  found_min_sample_size = std::min(found_min_sample_size, (size_t) n_tokens);
 992  
 993                  // write out tokens, start and size of sample
 994                  // overwrite the string start position with the token start position
 995                  out_samples_begin[i] = out_tokens.size();
 996                  out_samples_size[i] = (size_t) n_tokens;
 997                  out_tokens.insert(out_tokens.end(), tok_sample.begin(), tok_sample.begin() + n_tokens);
 998              } else {
 999                  out_samples_begin[i] = out_tokens.size();
1000                  out_samples_size[i] = 0;
1001              }
1002  
1003          }
1004          if (found_too_big_sample > 0) {
1005              printf("%s: warning: found %zu samples (max length %zu) that exceed context length of %u. samples will be cut off.\n",
1006                  __func__, found_too_big_sample, found_max_sample_size, context_length);
1007          }
1008  
1009          if (found_too_small_sample > 0) {
1010              printf("%s: warning: found %zu samples (min length %zu) that are shorter than context length of %u.\n",
1011                  __func__, found_too_small_sample, found_min_sample_size, context_length);
1012          }
1013  
1014          if (found_empty_sample) {
1015              printf("%s: warning: found %zu empty samples.\n",
1016                  __func__, found_empty_sample);
1017          }
1018      }
1019      printf("%s: total number of samples: %zu\n",
1020          __func__, out_samples_begin.size());
1021  
1022      GGML_ASSERT(out_samples_begin.size() == out_samples_size.size());
1023  
1024      return out_tokens.size();
1025  }
1026  
1027  std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration) {
1028      std::string sit = (iteration >= 0) ? std::to_string(iteration) : std::string(latest);
1029      return replace_str(filename, pattern_it, sit.c_str());
1030  }
1031  
1032  struct train_params_common get_default_train_params_common() {
1033      struct train_params_common params;
1034      params.fn_train_data     = "shakespeare.txt";
1035      params.fn_checkpoint_in  = "checkpoint.gguf";
1036      params.fn_checkpoint_out = "checkpoint-ITERATION.gguf";
1037      params.pattern_fn_it     = "ITERATION";
1038      params.fn_latest         = "LATEST";
1039  
1040      params.print_usage = false;
1041  
1042      params.save_every = 10;
1043  
1044      params.seed       =   -1;
1045  
1046      params.n_ctx      =  128;
1047      params.n_threads  =    6;
1048      params.n_batch    =    8;
1049      params.n_gradient_accumulation = 1;
1050      params.n_epochs   = -1;
1051      params.n_gpu_layers = 0;
1052  
1053      params.custom_n_ctx = false;
1054  
1055      params.use_flash              = false;
1056      params.use_checkpointing      = true;
1057  
1058      params.sample_start           = "";
1059      params.include_sample_start   = false;
1060      params.escape                 = false;
1061      params.overlapping_samples    = false;
1062      params.fill_with_next_samples = false;
1063      params.separate_with_eos      = false;
1064      params.separate_with_bos      = true;
1065      params.sample_random_offsets  = false;
1066      params.force_reshuffle        = false;
1067  
1068      params.opt_past               = 0;
1069      params.opt_delta              = 1e-5f;
1070      params.opt_max_no_improvement = 0;
1071  
1072      params.warmup            =  100;
1073      params.cos_decay_steps   = 1000;
1074      params.cos_decay_restart = 1.1f;
1075      params.cos_decay_min     = 0.1f;
1076      params.enable_restart    = false;
1077  
1078      params.adam_n_iter         = 256;
1079      params.adam_alpha          = 1e-3f;
1080      params.adam_min_alpha      = 0;
1081      params.adam_decay          = 1e-1f;
1082      params.adam_decay_min_ndim = 2;
1083      params.adam_beta1          = 0.9f;
1084      params.adam_beta2          = 0.999f;
1085      params.adam_gclip          = 1.0f;
1086      params.adam_eps_f          = 0.0f;
1087  
1088      return params;
1089  }
1090  
// Print the help text for the common training command line options to stderr.
// The generic "usage:"/"-h" header lines are intentionally left commented out
// here -- presumably they are printed by the program-specific usage function
// that calls this one (TODO confirm against callers).
void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train_params_common * params) {
    // fprintf(stderr, "usage: %s [options]\n", argv[0]);
    // fprintf(stderr, "\n");
    // fprintf(stderr, "options:\n");
    // fprintf(stderr, "  -h, --help                 show this help message and exit\n");
    // files, checkpoints and runtime sizes
    fprintf(stderr, "  --train-data FNAME         path from which to load training data (default '%s')\n", params->fn_train_data);
    fprintf(stderr, "  --checkpoint-in FNAME      path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
    fprintf(stderr, "  --checkpoint-out FNAME     path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
    fprintf(stderr, "  --pattern-fn-it STR        pattern in output filenames to be replaced by iteration number (default '%s')\n", params->pattern_fn_it);
    fprintf(stderr, "  --fn-latest STR            string to use instead of iteration number for saving latest output (default '%s')\n", params->fn_latest);
    fprintf(stderr, "  --save-every N             save checkpoint and lora every N iterations. Disabled when N <= 0. (default '%d')\n", params->save_every);
    fprintf(stderr, "  -s SEED, --seed SEED       RNG seed (default: -1, use random seed for -1)\n");
    fprintf(stderr, "  -c N, --ctx N              Context size used during training (default %d)\n", params->n_ctx);
    fprintf(stderr, "  -t N, --threads N          Number of threads (default %d)\n", params->n_threads);
    fprintf(stderr, "  -b N, --batch N            Parallel batch size (default %d)\n", params->n_batch);
    fprintf(stderr, "  --grad-acc N               Number of gradient accumulation steps (simulates larger batch size of batch*gradacc) (default %d)\n", params->n_gradient_accumulation);
    // sampling of training data
    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
    fprintf(stderr, "  --no-separate-with-eos     When fill-with-next-samples, don't insert end-of-sequence token between samples.%s\n", !params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --no-separate-with-bos     When fill-with-next-samples, don't insert begin-of-sequence token between samples.%s\n", !params->separate_with_bos ? " (default)" : "");
    fprintf(stderr, "  --sample-random-offsets    Use samples beginning at random offsets. Together with fill-with-next-samples this may help for training endless text generation.%s\n", params->sample_random_offsets ? " (default)" : "");
    fprintf(stderr, "  --force-reshuffle          Force a reshuffling of data at program start, otherwise the shuffling of loaded checkpoint is resumed.\n");
    // attention / checkpointing toggles
    fprintf(stderr, "  --no-flash                 Don't use flash attention \n");
    fprintf(stderr, "  --use-flash                Use flash attention (default)\n");
    fprintf(stderr, "  --no-checkpointing         Don't use gradient checkpointing\n");
    fprintf(stderr, "  --use-checkpointing        Use gradient checkpointing (default)\n");
    // learning rate schedule and convergence tests
    fprintf(stderr, "  --warmup N                 Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup);
    fprintf(stderr, "  --cos-decay-steps N        Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps);
    fprintf(stderr, "  --cos-decay-restart N      Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart);
    fprintf(stderr, "  --cos-decay-min N          Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min);
    fprintf(stderr, "  --enable-restart N         Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : "");
    fprintf(stderr, "  --disable-restart N        Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : "");
    fprintf(stderr, "  --opt-past N               Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past);
    fprintf(stderr, "  --opt-delta N              Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta);
    fprintf(stderr, "  --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement);
    fprintf(stderr, "  --epochs N                 Maximum number epochs to process. (default %d)\n", params->n_epochs);
    // Adam optimizer hyperparameters
    fprintf(stderr, "  --adam-iter N              Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter);
    fprintf(stderr, "  --adam-alpha N             Adam learning rate alpha (default %f)\n", params->adam_alpha);
    fprintf(stderr, "  --adam-min-alpha N         Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha);
    fprintf(stderr, "  --adam-decay N             AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay);
    fprintf(stderr, "  --adam-decay-min-ndim N    Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. (default %d)\n", params->adam_decay_min_ndim);
    fprintf(stderr, "  --adam-beta1 N             AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1);
    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
    fprintf(stderr, "  -ngl N, --n-gpu-layers N   Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
    fprintf(stderr, "\n");
}
1144  
1145  bool consume_common_train_arg(
1146      int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param
1147  ) {
1148      int& i = *idx;
1149      std::string arg = argv[i];
1150      const std::string arg_prefix = "--";
1151      if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
1152          std::replace(arg.begin(), arg.end(), '_', '-');
1153      }
1154      if (arg == "--train-data") {
1155          if (++i >= argc) {
1156              *invalid_param = true;
1157              return true;
1158          }
1159          params->fn_train_data = argv[i];
1160      } else if (arg == "--checkpoint-in") {
1161          if (++i >= argc) {
1162              *invalid_param = true;
1163              return true;
1164          }
1165          params->fn_checkpoint_in = argv[i];
1166      } else if (arg == "--checkpoint-out") {
1167          if (++i >= argc) {
1168              *invalid_param = true;
1169              return true;
1170          }
1171          params->fn_checkpoint_out = argv[i];
1172      } else if (arg == "--pattern-fn-it") {
1173          if (++i >= argc) {
1174              *invalid_param = true;
1175              return true;
1176          }
1177          params->pattern_fn_it = argv[i];
1178      } else if (arg == "--fn-latest") {
1179          if (++i >= argc) {
1180              *invalid_param = true;
1181              return true;
1182          }
1183          params->fn_latest = argv[i];
1184      } else if (arg == "--save-every") {
1185          if (++i >= argc) {
1186              *invalid_param = true;
1187              return true;
1188          }
1189          params->save_every = std::stoi(argv[i]);
1190      } else if (arg == "-s" || arg == "--seed") {
1191          if (++i >= argc) {
1192              *invalid_param = true;
1193              return true;
1194          }
1195          params->seed = std::stoi(argv[i]);
1196      } else if (arg == "-c" || arg == "--ctx") {
1197          if (++i >= argc) {
1198              *invalid_param = true;
1199              return true;
1200          }
1201          params->n_ctx = std::stoi(argv[i]);
1202          params->custom_n_ctx = true;
1203      } else if (arg == "-t" || arg == "--threads") {
1204          if (++i >= argc) {
1205              *invalid_param = true;
1206              return true;
1207          }
1208          params->n_threads = std::stoi(argv[i]);
1209      } else if (arg == "-b" || arg == "--batch") {
1210          if (++i >= argc) {
1211              *invalid_param = true;
1212              return true;
1213          }
1214          params->n_batch = std::stoi(argv[i]);
1215      } else if (arg == "--grad-acc") {
1216          if (++i >= argc) {
1217              *invalid_param = true;
1218              return true;
1219          }
1220          params->n_gradient_accumulation = std::max(1, std::stoi(argv[i]));
1221      } else if (arg == "--sample-start") {
1222          if (++i >= argc) {
1223              *invalid_param = true;
1224              return true;
1225          }
1226          params->sample_start = std::string(argv[i]);
1227      } else if (arg == "--escape") {
1228          params->escape = true;
1229      } else if (arg == "--include-sample-start") {
1230          params->include_sample_start = true;
1231      } else if (arg == "--overlapping-samples") {
1232          params->overlapping_samples = true;
1233      } else if (arg == "--fill-with-next-samples") {
1234          params->fill_with_next_samples = true;
1235      } else if (arg == "--separate-with-eos") {
1236          params->separate_with_eos = true;
1237      } else if (arg == "--separate-with-bos") {
1238          params->separate_with_bos = true;
1239      } else if (arg == "--no-separate-with-eos") {
1240          params->separate_with_eos = false;
1241      } else if (arg == "--no-separate-with-bos") {
1242          params->separate_with_bos = false;
1243      } else if (arg == "--sample-random-offsets") {
1244          params->sample_random_offsets = true;
1245      } else if (arg == "--force-reshuffle") {
1246          params->force_reshuffle = true;
1247      } else if (arg == "--no-flash") {
1248          params->use_flash = false;
1249      } else if (arg == "--use-flash") {
1250          params->use_flash = true;
1251      } else if (arg == "--no-checkpointing") {
1252          params->use_checkpointing = false;
1253      } else if (arg == "--use-checkpointing") {
1254          params->use_checkpointing = true;
1255      } else if (arg == "--warmup") {
1256          if (++i >= argc) {
1257              *invalid_param = true;
1258              return true;
1259          }
1260          params->warmup = std::stoi(argv[i]);
1261      } else if (arg == "--cos-decay-steps") {
1262          if (++i >= argc) {
1263              *invalid_param = true;
1264              return true;
1265          }
1266          params->cos_decay_steps = std::stoi(argv[i]);
1267      } else if (arg == "--cos-decay-restart") {
1268          if (++i >= argc) {
1269              *invalid_param = true;
1270              return true;
1271          }
1272          params->cos_decay_restart = std::stof(argv[i]);
1273      } else if (arg == "--cos-decay-min") {
1274          if (++i >= argc) {
1275              *invalid_param = true;
1276              return true;
1277          }
1278          params->cos_decay_min = std::stof(argv[i]);
1279      } else if (arg == "--enable-restart") {
1280          params->enable_restart = true;
1281      } else if (arg == "--disable-restart") {
1282          params->enable_restart = false;
1283      } else if (arg == "--opt-past") {
1284          if (++i >= argc) {
1285              *invalid_param = true;
1286              return true;
1287          }
1288          params->opt_past = std::stoi(argv[i]);
1289      } else if (arg == "--opt-delta") {
1290          if (++i >= argc) {
1291              *invalid_param = true;
1292              return true;
1293          }
1294          params->opt_delta = std::stof(argv[i]);
1295      } else if (arg == "--opt-max-no-improvement") {
1296          if (++i >= argc) {
1297              *invalid_param = true;
1298              return true;
1299          }
1300          params->opt_max_no_improvement = std::stoi(argv[i]);
1301      } else if (arg == "--adam-epsf") {
1302          if (++i >= argc) {
1303              *invalid_param = true;
1304              return true;
1305          }
1306          params->adam_eps_f = std::stof(argv[i]);
1307      } else if (arg == "--epochs") {
1308          if (++i >= argc) {
1309              *invalid_param = true;
1310              return true;
1311          }
1312          params->n_epochs = std::stoi(argv[i]);
1313      } else if (arg == "--adam-iter") {
1314          if (++i >= argc) {
1315              *invalid_param = true;
1316              return true;
1317          }
1318          params->adam_n_iter = std::stoi(argv[i]);
1319      } else if (arg == "--adam-alpha") {
1320          if (++i >= argc) {
1321              *invalid_param = true;
1322              return true;
1323          }
1324          params->adam_alpha = std::stof(argv[i]);
1325      } else if (arg == "--adam-min-alpha") {
1326          if (++i >= argc) {
1327              *invalid_param = true;
1328              return true;
1329          }
1330          params->adam_min_alpha = std::stof(argv[i]);
1331      } else if (arg == "--adam-decay") {
1332          if (++i >= argc) {
1333              *invalid_param = true;
1334              return true;
1335          }
1336          params->adam_decay = std::stof(argv[i]);
1337      } else if (arg == "--adam-decay-min-ndim") {
1338          if (++i >= argc) {
1339              *invalid_param = true;
1340              return true;
1341          }
1342          params->adam_decay_min_ndim = std::stoi(argv[i]);
1343      } else if (arg == "--adam-beta1") {
1344          if (++i >= argc) {
1345              *invalid_param = true;
1346              return true;
1347          }
1348          params->adam_beta1 = std::stof(argv[i]);
1349      } else if (arg == "--adam-beta2") {
1350          if (++i >= argc) {
1351              *invalid_param = true;
1352              return true;
1353          }
1354          params->adam_beta2 = std::stof(argv[i]);
1355      } else if (arg == "--adam-gclip") {
1356          if (++i >= argc) {
1357              *invalid_param = true;
1358              return true;
1359          }
1360          params->adam_gclip = std::stof(argv[i]);
1361      } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
1362              if (++i >= argc) {
1363                  *invalid_param = true;
1364                  return true;
1365              }
1366              if (llama_supports_gpu_offload()) {
1367                  params->n_gpu_layers = std::stoi(argv[i]);
1368              } else {
1369                  fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
1370                  fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
1371              }
1372      } else if (arg == "-h" || arg == "--help") {
1373          params->print_usage = true;
1374          return true;
1375      } else {
1376          return false;
1377      }
1378      return true;
1379  }
1380  
1381  void finish_processing_train_args(struct train_params_common * params) {
1382      if (params->escape) {
1383          string_process_escapes(params->sample_start);
1384      }
1385  }
1386  
1387  void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
1388      struct train_opt_callback_data * data   = (struct train_opt_callback_data *) vdata;
1389      struct train_params_common     * params = data->params;
1390      struct train_state             * train  = data->train;
1391      struct ggml_opt_context        * opt    = train->opt;
1392      int n_batch = params->n_batch;
1393      int n_ctx = params->n_ctx;
1394  
1395      if (accum_step == 0) {
1396          // time measurement
1397          int64_t now = ggml_time_ms();
1398          if (now > data->last_time && opt->iter > data->first_iter) {
1399              double dt = (double) (now - data->last_time);
1400              if (data->millis_per_iter == 0.0) {
1401                  data->millis_per_iter = dt;
1402              } else {
1403                  const double gain = 0.7;
1404                  data->millis_per_iter = data->millis_per_iter*(1.0-gain) + dt*gain;
1405              }
1406          }
1407  
1408          double remaining_millis = 0.0;
1409          if (data->millis_per_iter > 0.0) {
1410              const int n_iter = params->adam_n_iter;
1411              const int done_iter = opt->iter - data->first_iter;
1412              const int remaining_iter = n_iter - done_iter;
1413              remaining_millis = remaining_iter * data->millis_per_iter;
1414          }
1415  
1416          // file saving
1417          const bool save_now = (params->save_every > 0) && (opt->iter - data->last_save_iter >= params->save_every);
1418          if (save_now) {
1419              int new_iters = opt->iter - data->last_save_iter;
1420              train->train_its    += new_iters;
1421              train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_ctx;
1422  
1423              if (data->save_cb) {
1424                  data->save_cb(data->save_data, train);
1425              }
1426  
1427              data->last_save_iter = opt->iter;
1428          }
1429  
1430          // exclude file saving from time measurement, by measuring last_time after saving
1431          data->last_time = ggml_time_ms();
1432  
1433          *sched = learning_schedule(
1434              opt->iter,
1435              params->warmup,
1436              params->cos_decay_steps,
1437              params->adam_alpha,
1438              params->adam_min_alpha,
1439              params->cos_decay_min,
1440              params->cos_decay_restart,
1441              params->enable_restart);
1442  
1443          int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
1444          if (impr_plot > 0) impr_plot = 0;
1445          if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
1446          printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
1447              __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
1448              *sched, opt->loss_after);
1449  
1450  
1451          if (data->millis_per_iter > 0) {
1452              printf(" dt=");
1453              print_duration(data->millis_per_iter);
1454              printf(" eta=");
1455              print_duration(remaining_millis);
1456          }
1457  
1458          float improvement = opt->loss_before - opt->loss_after;
1459          const float plot_scale = 10.0f;
1460          int bar_len = (int)(1 + improvement*plot_scale + 0.5);
1461          printf(" |");
1462          for (int i=0; i<bar_len; ++i) {
1463              printf("-");
1464          }
1465          printf(">");
1466          printf("\n");
1467      }
1468  
1469      int64_t used_samples = get_example_targets_batch(
1470          data->lctx,
1471          data->tokens_input,
1472          data->target_probs,
1473          train->shuffle_next_sample,
1474          data->shuffled_samples_offs,
1475          data->shuffled_samples_begin,
1476          data->shuffled_samples_size,
1477          data->samples_count,
1478          data->tokens_data,
1479          data->tokens_size,
1480          params->separate_with_eos,
1481          params->separate_with_bos,
1482          params->fill_with_next_samples,
1483          params->sample_random_offsets);
1484  
1485      train->train_samples += used_samples;
1486      train->shuffle_next_sample += used_samples;
1487  
1488      if (train->shuffle_next_sample >= train->shuffle_sample_count) {
1489          ++train->train_epochs;
1490          printf("%s: reshuffle samples. completed epochs: %llu\n", __func__, (long long unsigned) train->train_epochs);
1491          // note: we may have used some samples from the current shuffling more than once
1492          train->shuffle_rng_state_current = train->shuffle_rng_state_next;
1493          train->shuffle_rng_state_next = shuffle_samples(
1494              train->shuffle_rng_state_current,
1495              data->shuffled_samples_offs,
1496              data->shuffled_samples_begin,
1497              data->shuffled_samples_size,
1498              data->samples_begin,
1499              data->samples_size,
1500              data->samples_count);
1501          train->shuffle_next_sample = 0;
1502      }
1503  
1504      const bool last_epoch_reached = (params->n_epochs > 0 && (int64_t) train->train_epochs - data->first_epoch >= params->n_epochs);
1505      if (last_epoch_reached) {
1506          // allow optimization iteration at last epoch to be completed before canceling
1507          if (data->iter_at_last_epoch < 0) {
1508              data->iter_at_last_epoch = opt->iter;
1509          } else if (opt->iter > data->iter_at_last_epoch) {
1510              *cancel = true;
1511          }
1512      }
1513  }