/ examples / cvector-generator / cvector-generator.cpp
cvector-generator.cpp
  1  #include "common.h"
  2  #include "llama.h"
  3  #include "ggml.h"
  4  #include "pca.hpp"
  5  
  6  #ifdef GGML_USE_CUDA
  7  #include "ggml-cuda.h"
  8  #endif
  9  
 10  #ifdef GGML_USE_METAL
 11  #include "ggml-metal.h"
 12  #endif
 13  
 14  #include <cstdio>
 15  #include <string>
 16  #include <tuple>
 17  #include <vector>
 18  #include <algorithm>
 19  #include <iostream>
 20  #include <fstream>
 21  #include <climits>
 22  
 23  
 24  //////////////////////////////////////////////////
 25  // utils
 26  
 27  template <class Iter>
 28  static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 29      std::string ret;
 30      for (; begin != end; ++begin) {
 31          ret += llama_token_to_piece(ctx, *begin);
 32      }
 33  
 34      return ret;
 35  }
 36  
 37  static void print_usage(int argc, char ** argv, const gpt_params & params) {
 38      gpt_params_print_usage(argc, argv, params);
 39  
 40      printf("\nexample usage:\n");
 41      printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
 42      printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
 43      printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
 44      printf("\n");
 45  }
 46  
 47  //////////////////////////////////////////////////
 48  
 49  
 50  // cb_eval is reused for each pair of positive - negative prompt
 51  struct callback_data {
 52      ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered
 53  
 54      int n_layers = 0;
 55      int n_tokens = 0;
 56      bool is_eval_pos = true;
 57  
 58      // each element of the vector correspond to one layer
 59      std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
 60      std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
 61      std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
 62  
 63      // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
 64      void save_tensor_for_layer(struct ggml_tensor * t) {
 65          GGML_ASSERT(t->type == GGML_TYPE_F32);
 66  
 67          if (ctx_ggml == nullptr) {
 68              // alloc a new ctx_ggml if needed
 69              struct ggml_init_params params_ggml = {
 70                  /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
 71                  /*.mem_buffer =*/ NULL,
 72                  /*.no_alloc   =*/ true,
 73              };
 74              ctx_ggml = ggml_init(params_ggml);
 75          }
 76  
 77          // copy tensor data
 78          auto n_bytes = ggml_nbytes(t);
 79          struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
 80          t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
 81          ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
 82          ggml_set_name(t_layer, ggml_get_name(t));
 83          //print_debug_tensor(t_layer);
 84  
 85          if (is_eval_pos) {
 86              v_pos.push_back(t_layer);
 87          } else {
 88              v_neg.push_back(t_layer);
 89          }
 90      }
 91  
 92      // calculate diff (v_pos - v_neg) and place the result back to v_pos
 93      // all zero rows in the diff tensor will also be removed
 94      // NOTE: final layer is ignored. we only have (n_layers - 1) to process
 95      std::vector<struct ggml_tensor *> calc_diff() {
 96          for (float il = 0; il < v_pos.size(); il++) {
 97              float * a = (float *) v_pos[il]->data;
 98              float * b = (float *) v_neg[il]->data;
 99              size_t n_elem = ggml_nelements(v_pos[il]);
100              for (size_t j = 0; j < n_elem; j++) {
101                  a[j] -= b[j];
102              }
103              //print_debug_tensor(v_pos[i]);
104              auto diff_filtered = filter_nonzero_rows(v_pos[il]);
105              v_diff_filtered.push_back(diff_filtered);
106          }
107          return v_diff_filtered; // for convinient, we return the result std::vector
108      }
109  
110      // delete zero rows from a given 2D tensor
111      struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
112          //printf("filter_nonzero_rows\n");
113          auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
114              // check if given row containing all zero elements
115              int n_cols = t->ne[0]; // hint: should be equal to n_embd
116              for (int col = 0; col < n_cols; ++col) {
117                  if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
118                      return false;
119                  }
120              }
121              return true;
122          };
123          std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
124          for (int i_row = 0; i_row < a->ne[1]; i_row++) {
125              if (!is_row_all_zeros(a, i_row, 1e-6)) {
126                  rows_to_copy.push_back(i_row);
127              }
128          }
129  
130          // get "n_nonzero_rows" for the output "diff_filtered"
131          int n_nonzero_rows = rows_to_copy.size();
132          //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
133          int n_embd = a->ne[0];
134          GGML_ASSERT(n_nonzero_rows > 0);
135  
136          // diff_filtered: [n_embd, n_nonzero_rows]
137          struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
138              ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
139          ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
140          diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
141  
142          // copy non-zero rows
143          for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
144              int src_row = rows_to_copy[dest_row];
145              for (int i = 0; i < n_embd; i++) {
146                  float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
147                  ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
148              }
149          }
150  
151          //print_debug_tensor(diff_filtered);
152  
153          return diff_filtered;
154      }
155  
156      // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
157      void reset() {
158          for (auto ptr : v_pos) free(ptr->data);
159          for (auto ptr : v_neg) free(ptr->data);
160          for (auto ptr : v_diff_filtered) free(ptr->data);
161          v_pos.clear();
162          v_neg.clear();
163          v_diff_filtered.clear();
164          if (ctx_ggml) {
165              ggml_free(ctx_ggml);
166          }
167          ctx_ggml = nullptr;
168      }
169  };
170  
171  /**
172   * process_ctx is used to store the ggml context for pre-post processing the diff vectors
173   * in short, input => v_diff and output => v_final
174   */
175  struct train_context {
176      ggml_context * ctx_ggml;
177      int n_embd;
178      int n_layers;
179  
180      /* pair of prompts to be used for generating final vector */
181      std::vector<std::string> positive_entries;
182      std::vector<std::string> negative_entries;
183  
184      // each element of the vector correspond to one layer
185      // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
186      // NOTE (2): v_diff is transposed from v_diff_tmp
187      std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
188      std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
189  
190      // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
191      // v_diff_tmp will get converted unto v_diff later on
192      std::vector<std::vector<uint8_t>> v_diff_tmp;
193  
194      train_context(int n_embd_, int n_layers_) {
195          n_embd = n_embd_;
196          n_layers = n_layers_;
197          struct ggml_init_params params_ggml = {
198              /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
199              /*.mem_buffer =*/ NULL,
200              /*.no_alloc   =*/ true,
201          };
202          ctx_ggml = ggml_init(params_ggml);
203          for (int il = 0; il < n_layers - 1; il++) {
204              std::vector<uint8_t> empty;
205              v_diff_tmp.push_back(empty);
206              auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
207              t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
208              v_final.push_back(t);
209          }
210      }
211  
212      // add new rows into existing tensor in v_diff_tmp
213      void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
214          GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
215          for (int il = 0; il < n_layers - 1; il++) {
216              auto t = diff_filtered[il];
217              auto & diff_tmp = v_diff_tmp[il];
218              size_t curr_size = diff_tmp.size();
219              diff_tmp.resize(curr_size + ggml_nbytes(t));
220              memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
221          }
222      }
223  
224      // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
225      // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
226      void build_v_diff() {
227          printf("build_v_diff\n");
228          for (int il = 0; il < n_layers - 1; il++) {
229              auto & diff_tmp = v_diff_tmp[il];
230              int n_elem = diff_tmp.size() / sizeof(float);
231              GGML_ASSERT(n_elem % n_embd == 0);
232              int n_rows = n_elem / n_embd;
233              struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
234              ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
235              // copy data & transpose
236              diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
237              float * arr = (float *) diff_tmp.data();
238              for (int ir = 0; ir < n_rows; ++ir) {
239                  for (int ic = 0; ic < n_embd; ++ic) {
240                      float f = arr[ir*n_embd + ic];
241                      ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
242                  }
243              }
244              v_diff.push_back(diff);
245              print_debug_tensor(diff);
246              // free memory of diff_tmp
247              diff_tmp.resize(0);
248          }
249      }
250  
251      ~train_context() {
252          for (auto ptr : v_final) free(ptr->data);
253          for (auto ptr : v_diff) free(ptr->data);
254          // no need to free v_diff_tmp, since we didn't use malloc
255          ggml_free(ctx_ggml);
256      }
257  };
258  
259  struct tokenized_prompt {
260      std::vector<llama_token> tokens_pos;
261      std::vector<llama_token> tokens_neg;
262      size_t max_seq_len;
263  
264      tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
265          const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
266          tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
267          tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
268          max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
269          padding_seq(ctx, tokens_pos, max_seq_len);
270          padding_seq(ctx, tokens_neg, max_seq_len);
271      }
272  
273      void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
274          // TODO: customize padding token
275          std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
276          llama_token pad_tok = pad_tokens.back();
277          while (tokens.size() < len) {
278              tokens.push_back(pad_tok);
279          }
280      }
281  };
282  
283  //////////////////////////////////////////////////
284  
// Format any stream-insertable value as a std::string using operator<<.
template <typename T>
static std::string to_string(const T & val) {
    std::ostringstream out;
    out << val;
    return out.str();
}
291  
292  static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
293      std::vector<std::string> output;
294      std::ifstream file(path);
295      if (!file.is_open()) {
296          fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
297          exit(1);
298      }
299      std::string line;
300      while (std::getline(file, line)) {
301          bool is_skip = skip_empty_lines && line.empty();
302          if (!is_skip) {
303              string_process_escapes(line);
304              output.push_back(line);
305          }
306      }
307      file.close();
308      return output;
309  }
310  
311  //////////////////////////////////////////////////
312  
313  static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
314      auto * cb_data = (callback_data *) user_data;
315      static const char * l_out_name = "l_out";
316      const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
317  
318      if (ask) {
319          return is_l_out;
320      }
321  
322      if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
323          return true;
324      }
325  
326      // save the tensor to current context
327      cb_data->save_tensor_for_layer(t);
328      return true;
329  }
330  
331  static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
332      llama_kv_cache_clear(ctx);
333      if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
334          fprintf(stderr, "%s : failed to eval\n", __func__);
335          return false;
336      }
337      return true;
338  }
339  
340  static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
341      struct gguf_context * ctx = gguf_init_empty();
342  
343      const std::string arch = "controlvector";
344      gguf_set_val_str(ctx, "general.architecture", arch.c_str());
345      gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
346      gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
347  
348      for (size_t i = 0; i < v_ctrl.size(); ++i) {
349          gguf_add_tensor(ctx, v_ctrl[i]);
350          print_debug_tensor(v_ctrl[i]);
351          printf("Added tensor: %s\n", v_ctrl[i]->name);
352      }
353  
354      printf("%s: writing file...\n", __func__);
355      gguf_write_to_file(ctx, fname.c_str(), false);
356      printf("%s: wrote file '%s'\n", __func__, fname.c_str());
357      gguf_free(ctx);
358  }
359  
360  /**
361   * Load prompt files and completion file.
362   * Then format each pair of prompt + completion to make an entry.
363   */
364  static int prepare_entries(gpt_params & params, train_context & ctx_train) {
365      // load prompts
366      std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
367      std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
368      if (positive_prompts.size() != negative_prompts.size()) {
369          fprintf(stderr, "number of positive and negative prompts must be equal\n");
370          return 1;
371      }
372      if (positive_prompts.empty()) {
373          fprintf(stderr, "must provide at least one prompt pair\n");
374          return 1;
375      }
376  
377      // create templated prompts
378      std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
379      auto format_template = [](std::string persona, std::string suffix) {
380          // entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
381          return persona + " " + suffix;
382      };
383      for (size_t i = 0; i < positive_prompts.size(); ++i) {
384          for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
385              // TODO replicate the truncations done by the python implementation
386              ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
387              ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
388          }
389      }
390      return 0;
391  }
392  
393  int main(int argc, char ** argv) {
394      gpt_params params;
395  
396      if (!gpt_params_parse(argc, argv, params)) {
397          print_usage(argc, argv, params);
398          return 1;
399      }
400  
401      if (params.n_pca_iterations % params.n_pca_batch != 0) {
402          fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
403          return 1;
404      }
405  
406  
407      callback_data cb_data;
408  
409      // pass the callback to the backend scheduler
410      // it will be executed for each node during the graph computation
411      params.cb_eval = cb_eval;
412      params.cb_eval_user_data = &cb_data;
413      params.warmup = false;
414  
415      print_build_info();
416      llama_backend_init();
417      llama_numa_init(params.numa);
418  
419      // load the model to get hparams
420      llama_model * model;
421      llama_context * ctx;
422      std::tie(model, ctx) = llama_init_from_gpt_params(params);
423  
424      // int n_ctx = llama_n_ctx(ctx);
425      int n_layers = llama_n_layer(model);
426      int n_embd = llama_n_embd(model);
427      // get model hint param (a.k.a model arch name)
428      char model_hint[128];
429      llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
430  
431      // init train_context
432      train_context ctx_train(n_embd, n_layers);
433  
434      // load and prepare entries for training
435      prepare_entries(params, ctx_train);
436  
437      // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
438      std::vector<tokenized_prompt> tokenized_prompts;
439      size_t n_total_tokens = 0;
440      for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
441          tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
442          n_total_tokens += 2 * t.max_seq_len;
443          tokenized_prompts.push_back(std::move(t));
444      }
445  
446      std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
447  
448      for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
449          bool success = false;
450          tokenized_prompt t = tokenized_prompts[i];
451          cb_data.n_layers = n_layers;
452          cb_data.n_tokens = t.max_seq_len;
453  
454          printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
455              (int) i+1, (int) ctx_train.positive_entries.size(),
456              tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
457              tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
458              (int) t.max_seq_len);
459  
460          cb_data.is_eval_pos = true;
461          success = get_hidden_layers(ctx, t.tokens_pos);
462          if (!success) break;
463  
464          cb_data.is_eval_pos = false;
465          success = get_hidden_layers(ctx, t.tokens_neg);
466          if (!success) break;
467  
468          // calculate diff and remove all zero rows
469          auto v_diff_filtered = cb_data.calc_diff();
470  
471          // save & concat the filtered v_diff to ctx_train
472          ctx_train.concat_diff_tmp(v_diff_filtered);
473  
474          // reset for next iteration
475          cb_data.reset();
476      }
477  
478      // done with the model, we can now free it to make gain some memory
479      printf("Done evaluate prompts, unload model...\n");
480      llama_free(ctx);
481      llama_free_model(model);
482  
483      // prepare ctx_train for PCA
484      ctx_train.build_v_diff();
485  
486      // run PCA
487      PCA::pca_params pca_params;
488      pca_params.n_threads = params.n_threads;
489      pca_params.n_batch = params.n_pca_batch;
490      pca_params.n_iterations = params.n_pca_iterations;
491      PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
492  
493      // write output vectors to gguf
494      export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
495  
496      llama_backend_free();
497  
498      return 0;
499  }