infill.cpp
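// Fill-in-the-middle (infill) example: given a prefix (--in-prefix) and a suffix
// (--in-suffix), the model generates the text that belongs between them. In interactive
// mode a new prefix/suffix pair can be entered after each completion.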
#include "common.h"

#include "console.h"
#include "llama.h"
#include "grammar-parser.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static llama_context ** g_ctx;
static llama_model ** g_model;
static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;

static bool is_interacting = false;

static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
) {
    if (params.logdir.empty()) {
        return;
    }

    const std::string timestamp = string_get_sortable_timestamp();

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }

    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
}
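// Ctrl+C behaviour: the first SIGINT only sets is_interacting so the generation loop can
// hand control back to the user; a second SIGINT while already interacting cleans up the
// console, prints timings, writes the logfile and exits with status 130 (128 + SIGINT).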
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
    }
}
#endif

int main(int argc, char ** argv) {
    gpt_params params;
    llama_sampling_params & sparams = params.sparams;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
        printf("\n************\n");
        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
        printf("\n************\n");
        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    LOG_TEE("%s: seed = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    g_model = &model;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        LOG_TEE("\n");
        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);
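    // Build the fill-in-the-middle (FIM) prompt in prefix-suffix-middle order:
    //   [BOS] <PRE> {prefix tokens} <SUF> {suffix tokens} <MID>
    // and let the model generate the missing middle part. When --escape is used, a single
    // leading space is stripped from the suffix (either the literal character in --in-suffix
    // or, after tokenization, token id 29871, the lone-space piece of LLaMA-style
    // SentencePiece vocabularies) so the suffix is not shifted by a spurious space.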
    bool suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
    embd_inp.push_back(llama_token_middle(model));

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(model));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
        params.n_keep = (int) embd_inp.size();
    }

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
        params.interactive = true;
    }

    if (params.verbose_prompt) {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > 0) {
            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            LOG_TEE("'\n");
        }
        LOG_TEE("\n");
    }

    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

    LOG_TEE("\n##### Infill mode #####\n\n");
    if (params.infill) {
        printf("\n************\n");
        printf("no need to specify '--infill', always running infill\n");
        printf("************\n\n");
    }
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to LLaMA.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
        LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
        LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
#endif
        LOG_TEE( "%s\n", control_message);

        is_interacting = params.interactive_first;
    }

    bool input_echo = true;

    int n_past = 0;
    int n_remain = params.n_predict;
    int n_consumed = 0;

    std::vector<int> input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int> output_tokens; g_output_tokens = &output_tokens;
    std::ostringstream output_ss;   g_output_ss     = &output_ss;

    // the first thing we will do is to output the prompt, so set color accordingly
    console::set_display(console::prompt);

    std::vector<llama_token> embd;

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

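    // Main generation loop. Each iteration either decodes the tokens queued in `embd`
    // (swapping out part of the KV cache when the context fills up), samples one new token
    // once the whole prompt has been consumed, or forwards up to n_batch remaining prompt
    // tokens from `embd_inp` into `embd`. In interactive mode the loop can also read a new
    // prefix/suffix pair from the user and restart evaluation with a rebuilt infill prompt.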
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

                console::set_display(console::error);
                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
                fflush(stdout);
            }

            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

                n_past -= n_discard;

                LOG("after swap: n_past = %d\n", n_past);

                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
            }

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
                    n_eval = params.n_batch;
                }

                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

                LOG("n_past = %d\n", n_past);
            }
        }

        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

            llama_sampling_accept(ctx_sampling, ctx, id, true);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

            // echo this to console
            input_echo = true;

            // decrement remaining sampling budget
            --n_remain;

            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        // display text
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
                } else {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
            }
            fflush(stdout);
        }
        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }

        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive) {
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
                fflush(stdout);
                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
                bool another_line = true;
                // set a new prefix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
                // set a new suffix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);

                if (params.escape) {
                    // process escape sequences; for the initial prompt this is done in common.cpp
                    // when the params are loaded, but for interactive mode we need to do it here
                    string_process_escapes(params.input_prefix);
                    string_process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                embd_inp.push_back(llama_token_middle(model));
                embd.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
                    is_interacting = true;
                    printf("\n");
                    console::set_display(console::user_input);
                    fflush(stdout);
                }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
                LOG("waiting for user input\n");

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }

                std::string line;
                bool another_line = true;
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);

                // done taking input, reset color
                console::set_display(console::reset);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering an empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }

                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
                        const llama_token token = embd_inp[i];
                        output_tokens.push_back(token);
                        output_ss << llama_token_to_piece(ctx, token);
                    }

                    n_remain -= line_inp.size();
                    LOG("n_remain: %d\n", n_remain);
                } else {
                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
            }

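            // After an interactive round, reset the sampling context so history-based
            // penalties (and any grammar state) from the previous round do not carry over
            // into the next generation.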
            if (n_past > 0) {
                if (is_interacting) {
                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
        }

        // end of generation
        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
    }

    if (!params.interactive && n_remain <= 0) {
        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
        fflush(stdout);
    }

    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
    llama_free_model(model);

    llama_sampling_free(ctx_sampling);
    llama_backend_free();

#ifndef LOG_DISABLE_LOGS
    LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

    return 0;
}