cvector-generator.cpp
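// cvector-generator: builds a per-layer "control vector" GGUF from pairs of
// positive/negative prompts. For each pair, the per-layer hidden states
// (l_out) are captured for both prompts, their differences are collected,
// and PCA (see pca.hpp) reduces each layer's diff matrix to a single
// direction vector of size n_embd, which is written out via export_gguf().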
#include "common.h"
#include "llama.h"
#include "ggml.h"
#include "pca.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <sstream>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <climits>


//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
}

static void print_usage(int argc, char ** argv, const gpt_params & params) {
    gpt_params_print_usage(argc, argv, params);

    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n    advanced:   %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////


// cb_eval is reused for each pair of positive/negative prompts
struct callback_data {
    ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg;           // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }
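    // Shape flow for one evaluated prompt pair (illustrative; per layer il):
    //   v_pos[il], v_neg[il]           : [n_embd, n_tokens]
    //   calc_diff()                    : v_pos[il] -= v_neg[il]   (in place)
    //   filter_nonzero_rows(v_pos[il]) : [n_embd, n_nonzero_rows] (all-zero rows dropped)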
    // calculate diff (v_pos - v_neg) and place the result back into v_pos
    // all zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored; we only have (n_layers - 1) layers to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the resulting std::vector
    }

    // delete zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only (near-)zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                if (std::fabs(ggml_get_f32_nd(t, col, row, 0, 0)) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into rows of diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor, because we want to reuse callback_data; we just want to free the tensors
    void reset() {
        for (auto ptr : v_pos)           free(ptr->data);
        for (auto ptr : v_neg)           free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};
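// Capture flow for one prompt pair (a sketch; mirrors the loop in main() below,
// assuming `ctx` is a loaded llama_context and cb_eval is registered as the
// eval callback):
//
//   callback_data cb_data;
//   cb_data.n_layers    = n_layers;
//   cb_data.n_tokens    = max_seq_len;
//   cb_data.is_eval_pos = true;
//   get_hidden_layers(ctx, tokens_pos);   // cb_eval saves l_out into v_pos
//   cb_data.is_eval_pos = false;
//   get_hidden_layers(ctx, tokens_neg);   // cb_eval saves l_out into v_neg
//   auto v_diff_filtered = cb_data.calc_diff();
//   cb_data.reset();                      // free tensors, reuse for next pair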
/**
 * process_ctx is used to store the ggml context for pre/post-processing the diff vectors
 * in short: input => v_diff, output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts to be used for generating the final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded, therefore we will have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to easily re-alloc when concatenating v_diff, we temporarily store v_diff in a vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of this malloc if possible
            v_final.push_back(t);
        }
    }

    // add new rows into the existing tensors in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

    // build the v_diff tensors from v_diff_tmp (v_diff needs to be transposed)
    // TODO @ngxson : maybe add an option NOT to transpose v_diff; will be useful for the "mean" method
    void build_v_diff() {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            // copy data & transpose
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            float * arr = (float *) diff_tmp.data();
            for (int ir = 0; ir < n_rows; ++ir) {
                for (int ic = 0; ic < n_embd; ++ic) {
                    float f = arr[ir*n_embd + ic];
                    ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                }
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // free the memory of diff_tmp
            diff_tmp.resize(0);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff)  free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc
        ggml_free(ctx_ggml);
    }
};
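// Memory layout sketch for build_v_diff() above (illustrative):
//   v_diff_tmp[il] : raw float buffer; arr[ir*n_embd + ic] = token-row ir, dim ic
//   v_diff[il]     : ggml 2D tensor with ne = [n_rows, n_embd]; since ne[0] is
//                    the fastest-varying ggml dimension, the element-wise copy
//                    via ggml_set_f32_nd(diff, ir, ic, ...) stores the
//                    transposed layout, matching NOTE (2) above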
struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
        const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
        tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
        tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize the padding token
        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor into the current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_kv_cache_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}

static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}

/**
 * Load the prompt files and the completion file.
 * Then format each pair of prompt + completion to make an entry.
 */
static int prepare_entries(gpt_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }

    // create templated prompts
    std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
    auto format_template = [](std::string persona, std::string suffix) {
        // entries in positive/negative.txt must already be formatted, i.e. "[INST] Act as if you're extremely happy. [/INST]"
        return persona + " " + suffix;
    };
    for (size_t i = 0; i < positive_prompts.size(); ++i) {
        for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
            // TODO: replicate the truncations done by the python implementation
            ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
            ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
        }
    }
    return 0;
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of the PCA batch size\n");
        return 1;
    }

    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_n_layer(model);
    int n_embd = llama_n_embd(model);
    // get the model hint param (a.k.a. the model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, 128);

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    prepare_entries(params, ctx_train);

    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate the diff and remove all zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff into ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for the next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to gain some memory
    printf("Done evaluating prompts, unloading model...\n");
    llama_free(ctx);
    llama_free_model(model);

    // prepare ctx_train for PCA
    ctx_train.build_v_diff();

    // run PCA
    PCA::pca_params pca_params;
    pca_params.n_threads    = params.n_threads;
    pca_params.n_batch      = params.n_pca_batch;
    pca_params.n_iterations = params.n_pca_iterations;
    PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

    // write the output vectors to gguf
    export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);

    llama_backend_free();

    return 0;
}
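// Example (not part of this program): the exported GGUF can be applied at
// inference time via llama.cpp's common control-vector flags, e.g.
//
//   ./llama-cli -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf \
//       --control-vector-scaled ./control_vector.gguf 0.8 \
//       -p "[INST] How are you? [/INST]"
//
// (binary name, output path, and scale are illustrative; flag names are those
// of llama.cpp's common argument parser at the time of writing)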