// examples/infill/infill.cpp
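//
// fill-in-the-middle (FIM) code completion example
//
// example invocation (model path and flag values are illustrative):
//
//   ./infill -m models/codellama-7b.Q4_K_M.gguf -n 64 \
//       --in-prefix "def remove_non_ascii(s: str) -> str:\n    " \
//       --in-suffix "\n    return result\n"
//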
#include "common.h"

#include "console.h"
#include "llama.h"
#include "grammar-parser.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

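// pointers into main()'s state, shared with sigint_handler() so it can dump timings and the logfile on Ctrl+C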
static llama_context           ** g_ctx;
static llama_model             ** g_model;
static gpt_params               * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream       * g_output_ss;
static std::vector<llama_token> * g_output_tokens;

static bool is_interacting = false;

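// dump the run (params, input tokens, generated output, timings) as YAML into params.logdir, if set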
static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
) {
    if (params.logdir.empty()) {
        return;
    }

    const std::string timestamp = string_get_sortable_timestamp();

    const bool success = fs_create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }

    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
    }
}
#endif

int main(int argc, char ** argv) {
    gpt_params params;
    llama_sampling_params & sparams = params.sparams;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
        printf("\n************\n");
        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
        printf("\n************\n");
        printf("%s: please use '--interactive-first' or specify '--in-prefix' and/or '--in-suffix'\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    g_model = &model;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        LOG_TEE("\n");
        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
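    // build the fill-in-the-middle (FIM) prompt from the special tokens reported by the model,
    // laid out as: [BOS] <PRE> prefix tokens <SUF> suffix tokens <MID>
    // (CodeLlama-style FIM: the model then generates the "middle" that joins prefix and suffix)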
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871; // id of the leading-space piece ('▁') in the LLaMA SentencePiece vocab
    if (suff_rm_leading_spc && !inp_sfx.empty() && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
    embd_inp.push_back(llama_token_middle(model));

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(model));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
        params.n_keep = (int) embd_inp.size();
    }

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
        params.interactive = true;
    }

    if (params.verbose_prompt) {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (params.n_keep > 0) {
            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            LOG_TEE("'\n");
        }
        LOG_TEE("\n");
    }

    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
        printf("no need to specify '--infill', always running infill\n");
        printf("************\n\n");
    }
    if (params.interactive) {
        const char * control_message;
        if (params.multiline_input) {
            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to LLaMA.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
        LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
#endif
        LOG_TEE(       "%s\n", control_message);

        is_interacting = params.interactive_first;
    }

    bool input_echo = true;

    int n_past     = 0;
    int n_remain   = params.n_predict;
    int n_consumed = 0;

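    // captured into globals so sigint_handler()/write_logfile() can reach them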
    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
    std::ostringstream output_ss;     g_output_ss     = &output_ss;

    // the first thing we will do is to output the prompt, so set color accordingly
    console::set_display(console::prompt);

    std::vector<llama_token> embd;

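    // sampling state (carries the history of previous tokens for repetition penalties, plus grammar state)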
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

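    // main loop: each iteration either forwards pending prompt tokens into embd or samples one
    // new token, then decodes embd and echoes it to the console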
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

                console::set_display(console::error);
                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
                fflush(stdout);
            }

            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
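            // e.g. with n_ctx = 512, n_keep = 32 and n_past = 512: n_left = 479, n_discard = 239,
            // so KV cells [33, 272) are removed and the remaining cells are shifted back by 239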
            if (n_past + (int) embd.size() > n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

                n_past -= n_discard;

                LOG("after swap: n_past = %d\n", n_past);

                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

            }

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
                    n_eval = params.n_batch;
                }

                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

                LOG("n_past = %d\n", n_past);
            }

        }

        embd.clear();

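        // sample the next token once the whole prompt has been consumed (and we are not waiting on the user)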
        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

            llama_sampling_accept(ctx_sampling, ctx, id, true);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());

            embd.push_back(id);

            // echo this to console
            input_echo = true;

            // decrement remaining sampling budget
            --n_remain;

            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);

                // push the prompt in the sampling context in order to apply repetition penalties later
                // for the prompt, we don't apply grammar rules
                llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);

                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        // display text
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
                } else {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
            }
            fflush(stdout);
        }
        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }

        // if not currently processing queued inputs:
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive) {
                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
                fflush(stdout);
                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
                bool another_line = true;
                // set a new prefix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
                // set a new suffix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
                if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);

                if (params.escape) {
                    // process escape sequences; for the initial prompt this is done in common.cpp
                    // when the params are loaded, but in interactive mode it has to be done here
                    string_process_escapes(params.input_prefix);
                    string_process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
                if (suff_rm_leading_spc && !inp_sfx.empty() && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                embd_inp.push_back(llama_token_middle(model));
                embd.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of generation tokens in interactive mode
            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
                    is_interacting = true;
                    printf("\n");
                    console::set_display(console::user_input);
                    fflush(stdout);
                }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
                LOG("waiting for user input\n");

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }

                std::string line;
                bool another_line = true;
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);

                // done taking input, reset color
                console::set_display(console::reset);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering an empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }

                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
                        const llama_token token = embd_inp[i];
                        output_tokens.push_back(token);
                        output_ss << llama_token_to_piece(ctx, token);
                    }

                    n_remain -= line_inp.size();
                    LOG("n_remain: %d\n", n_remain);
                } else {
                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
            }

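            // after an interactive exchange, reset the sampling history before generating again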
            if (n_past > 0) {
                if (is_interacting) {
                    llama_sampling_reset(ctx_sampling);
                }
                is_interacting = false;
            }
        }

        // end of generation
        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
    }
    if (!params.interactive && n_remain <= 0) {
        printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
        fflush(stdout);
    }

    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    llama_free(ctx);
    llama_free_model(model);

    llama_sampling_free(ctx_sampling);
    llama_backend_free();

#ifndef LOG_DISABLE_LOGS
    LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

    return 0;
}