// examples/llama-bench/llama-bench.cpp
   1  #include <algorithm>
   2  #include <array>
   3  #include <cassert>
   4  #include <chrono>
   5  #include <cinttypes>
   6  #include <clocale>
   7  #include <cmath>
   8  #include <cstdio>
   9  #include <cstring>
  10  #include <ctime>
  11  #include <cstdlib>
  12  #include <iterator>
  13  #include <map>
  14  #include <numeric>
  15  #include <regex>
  16  #include <sstream>
  17  #include <string>
  18  #include <vector>
  19  
  20  #include "ggml.h"
  21  #include "llama.h"
  22  #include "common.h"
  23  #include "ggml-cuda.h"
  24  #include "ggml-sycl.h"
  25  
  26  // utils
  27  static uint64_t get_time_ns() {
  28      using clock = std::chrono::high_resolution_clock;
  29      return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
  30  }
  31  
  32  template<class T>
  33  static std::string join(const std::vector<T> & values, const std::string & delim) {
  34      std::ostringstream str;
  35      for (size_t i = 0; i < values.size(); i++) {
  36          str << values[i];
  37          if (i < values.size() - 1) {
  38              str << delim;
  39          }
  40      }
  41      return str.str();
  42  }
  43  
  44  template<typename T, typename F>
  45  static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
  46      std::vector<std::string> str_values;
  47      std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
  48      return str_values;
  49  }
  50  
  51  template<typename T>
  52  static T avg(const std::vector<T> & v) {
  53      if (v.empty()) {
  54          return 0;
  55      }
  56      T sum = std::accumulate(v.begin(), v.end(), T(0));
  57      return sum / (T)v.size();
  58  }
  59  
  60  template<typename T>
  61  static T stdev(const std::vector<T> & v) {
  62      if (v.size() <= 1) {
  63          return 0;
  64      }
  65      T mean = avg(v);
  66      T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
  67      T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
  68      return stdev;
  69  }
  70  
  71  static std::string get_cpu_info() {
  72      std::string id;
  73  #ifdef __linux__
  74      FILE * f = fopen("/proc/cpuinfo", "r");
  75      if (f) {
  76          char buf[1024];
  77          while (fgets(buf, sizeof(buf), f)) {
  78              if (strncmp(buf, "model name", 10) == 0) {
  79                  char * p = strchr(buf, ':');
  80                  if (p) {
  81                      p++;
  82                      while (std::isspace(*p)) {
  83                          p++;
  84                      }
  85                      while (std::isspace(p[strlen(p) - 1])) {
  86                          p[strlen(p) - 1] = '\0';
  87                      }
  88                      id = p;
  89                      break;
  90                  }
  91              }
  92          }
  93          fclose(f);
  94      }
  95  #endif
  96      // TODO: other platforms
  97      return id;
  98  }
  99  
 100  static std::string get_gpu_info() {
 101      std::string id;
 102  #ifdef GGML_USE_CUDA
 103      int count = ggml_backend_cuda_get_device_count();
 104      for (int i = 0; i < count; i++) {
 105          char buf[128];
 106          ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
 107          id += buf;
 108          if (i < count - 1) {
 109              id += "/";
 110          }
 111      }
 112  #endif
 113  #ifdef GGML_USE_SYCL
 114      int count = ggml_backend_sycl_get_device_count();
 115      for (int i = 0; i < count; i++) {
 116          char buf[128];
 117          ggml_sycl_get_device_description(i, buf, sizeof(buf));
 118          id += buf;
 119          if (i < count - 1) {
 120              id += "/";
 121          }
 122      }
 123  #endif
 124      // TODO: other backends
 125      return id;
 126  }
 127  
 128  // command line params
// result output formats; NONE is the default for the stderr stream (see cmd_params_defaults)
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
 130  
 131  static const char * output_format_str(output_formats format) {
 132      switch (format) {
 133          case NONE:     return "none";
 134          case CSV:      return "csv";
 135          case JSON:     return "json";
 136          case MARKDOWN: return "md";
 137          case SQL:      return "sql";
 138          default: GGML_ASSERT(!"invalid output format");
 139      }
 140  }
 141  
 142  static bool output_format_from_str(const std::string & s, output_formats & format) {
 143      if (s == "none") {
 144          format = NONE;
 145      } else if (s == "csv") {
 146          format = CSV;
 147      } else if (s == "json") {
 148          format = JSON;
 149      } else if (s == "md") {
 150          format = MARKDOWN;
 151      } else if (s == "sql") {
 152          format = SQL;
 153      } else {
 154          return false;
 155      }
 156      return true;
 157  }
 158  
 159  static const char * split_mode_str(llama_split_mode mode) {
 160      switch (mode) {
 161          case LLAMA_SPLIT_MODE_NONE:  return "none";
 162          case LLAMA_SPLIT_MODE_LAYER: return "layer";
 163          case LLAMA_SPLIT_MODE_ROW:   return "row";
 164          default: GGML_ASSERT(!"invalid split mode");
 165      }
 166  }
 167  
 168  static std::string pair_str(const std::pair<int, int> & p) {
 169      static char buf[32];
 170      snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
 171      return buf;
 172  }
 173  
// Aggregated command-line configuration. Each std::vector field holds every
// value supplied for that flag; the benchmark later runs the cross product of
// all of them (see get_cmd_params_instances).
struct cmd_params {
    std::vector<std::string> model;          // -m:    model file paths
    std::vector<int> n_prompt;               // -p:    prompt lengths (tokens)
    std::vector<int> n_gen;                  // -n:    generation lengths (tokens)
    std::vector<std::pair<int, int>> n_pg;   // -pg:   combined prompt,gen pairs
    std::vector<int> n_batch;                // -b:    logical batch sizes
    std::vector<int> n_ubatch;               // -ub:   physical batch sizes
    std::vector<ggml_type> type_k;           // -ctk:  KV cache K types
    std::vector<ggml_type> type_v;           // -ctv:  KV cache V types
    std::vector<int> n_threads;              // -t:    thread counts
    std::vector<int> n_gpu_layers;           // -ngl:  layers offloaded to GPU
    std::vector<std::string> rpc_servers;    // -rpc:  RPC server lists
    std::vector<llama_split_mode> split_mode;// -sm:   multi-GPU split modes
    std::vector<int> main_gpu;               // -mg:   primary GPU indices
    std::vector<bool> no_kv_offload;         // -nkvo: disable KV offload
    std::vector<bool> flash_attn;            // -fa:   enable flash attention
    std::vector<std::vector<float>> tensor_split; // -ts: per-device split fractions
    std::vector<bool> use_mmap;              // -mmp:  mmap the model file
    std::vector<bool> embeddings;            // -embd: embeddings mode
    // scalar settings (one value each, not part of the cross product)
    ggml_numa_strategy numa;                 // --numa
    int reps;                                // -r: repetitions per instance
    bool verbose;                            // -v
    output_formats output_format;            // -o:  stdout format
    output_formats output_format_stderr;     // -oe: stderr format
};
 199  
// Defaults applied in parse_cmd_params for any parameter the user does not
// set. The /* name */ comments label the positional aggregate initializers;
// keep them in the same order as the cmd_params members.
static const cmd_params cmd_params_defaults = {
    /* model                */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt             */ {512},
    /* n_gen                */ {128},
    /* n_pg                 */ {},
    /* n_batch              */ {2048},
    /* n_ubatch             */ {512},
    /* type_k               */ {GGML_TYPE_F16},
    /* type_v               */ {GGML_TYPE_F16},
    /* n_threads            */ {cpu_get_num_math()},
    /* n_gpu_layers         */ {99},
    /* rpc_servers          */ {""},
    /* split_mode           */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu             */ {0},
    /* no_kv_offload        */ {false},
    /* flash_attn           */ {false},
    /* tensor_split         */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap             */ {true},
    /* embeddings           */ {false},
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
    /* verbose              */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
};
 225  
// Print the command-line help text. Default values are rendered from
// cmd_params_defaults (via join/transform_to_str) so the help stays in sync
// with the actual defaults.
static void print_usage(int /* argc */, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -rpc, --rpc <rpc_servers>           (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
    printf("  -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
 257  
 258  static ggml_type ggml_type_from_name(const std::string & s) {
 259      if (s == "f16") {
 260          return GGML_TYPE_F16;
 261      }
 262      if (s == "q8_0") {
 263          return GGML_TYPE_Q8_0;
 264      }
 265      if (s == "q4_0") {
 266          return GGML_TYPE_Q4_0;
 267      }
 268      if (s == "q4_1") {
 269          return GGML_TYPE_Q4_1;
 270      }
 271      if (s == "q5_0") {
 272          return GGML_TYPE_Q5_0;
 273      }
 274      if (s == "q5_1") {
 275          return GGML_TYPE_Q5_1;
 276      }
 277      if (s == "iq4_nl") {
 278          return GGML_TYPE_IQ4_NL;
 279      }
 280  
 281      return GGML_TYPE_COUNT;
 282  }
 283  
 284  
 285  static cmd_params parse_cmd_params(int argc, char ** argv) {
 286      cmd_params params;
 287      std::string arg;
 288      bool invalid_param = false;
 289      const std::string arg_prefix = "--";
 290      const char split_delim = ',';
 291  
 292      params.verbose = cmd_params_defaults.verbose;
 293      params.output_format = cmd_params_defaults.output_format;
 294      params.output_format_stderr = cmd_params_defaults.output_format_stderr;
 295      params.reps = cmd_params_defaults.reps;
 296      params.numa = cmd_params_defaults.numa;
 297  
 298      for (int i = 1; i < argc; i++) {
 299          arg = argv[i];
 300          if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
 301              std::replace(arg.begin(), arg.end(), '_', '-');
 302          }
 303  
 304          if (arg == "-h" || arg == "--help") {
 305              print_usage(argc, argv);
 306              exit(0);
 307          } else if (arg == "-m" || arg == "--model") {
 308              if (++i >= argc) {
 309                  invalid_param = true;
 310                  break;
 311              }
 312              auto p = string_split<std::string>(argv[i], split_delim);
 313              params.model.insert(params.model.end(), p.begin(), p.end());
 314          } else if (arg == "-p" || arg == "--n-prompt") {
 315              if (++i >= argc) {
 316                  invalid_param = true;
 317                  break;
 318              }
 319              auto p = string_split<int>(argv[i], split_delim);
 320              params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
 321          } else if (arg == "-n" || arg == "--n-gen") {
 322              if (++i >= argc) {
 323                  invalid_param = true;
 324                  break;
 325              }
 326              auto p = string_split<int>(argv[i], split_delim);
 327              params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
 328          } else if (arg == "-pg") {
 329              if (++i >= argc) {
 330                  invalid_param = true;
 331                  break;
 332              }
 333              auto p = string_split<std::string>(argv[i], ',');
 334              if (p.size() != 2) {
 335                  invalid_param = true;
 336                  break;
 337              }
 338              params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
 339          } else if (arg == "-b" || arg == "--batch-size") {
 340              if (++i >= argc) {
 341                  invalid_param = true;
 342                  break;
 343              }
 344              auto p = string_split<int>(argv[i], split_delim);
 345              params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
 346          } else if (arg == "-ub" || arg == "--ubatch-size") {
 347              if (++i >= argc) {
 348                  invalid_param = true;
 349                  break;
 350              }
 351              auto p = string_split<int>(argv[i], split_delim);
 352              params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
 353          } else if (arg == "-ctk" || arg == "--cache-type-k") {
 354              if (++i >= argc) {
 355                  invalid_param = true;
 356                  break;
 357              }
 358              auto p = string_split<std::string>(argv[i], split_delim);
 359              std::vector<ggml_type> types;
 360              for (const auto & t : p) {
 361                  ggml_type gt = ggml_type_from_name(t);
 362                  if (gt == GGML_TYPE_COUNT) {
 363                      invalid_param = true;
 364                      break;
 365                  }
 366                  types.push_back(gt);
 367              }
 368              params.type_k.insert(params.type_k.end(), types.begin(), types.end());
 369          } else if (arg == "-ctv" || arg == "--cache-type-v") {
 370              if (++i >= argc) {
 371                  invalid_param = true;
 372                  break;
 373              }
 374              auto p = string_split<std::string>(argv[i], split_delim);
 375              std::vector<ggml_type> types;
 376              for (const auto & t : p) {
 377                  ggml_type gt = ggml_type_from_name(t);
 378                  if (gt == GGML_TYPE_COUNT) {
 379                      invalid_param = true;
 380                      break;
 381                  }
 382                  types.push_back(gt);
 383              }
 384              params.type_v.insert(params.type_v.end(), types.begin(), types.end());
 385          } else if (arg == "-t" || arg == "--threads") {
 386              if (++i >= argc) {
 387                  invalid_param = true;
 388                  break;
 389              }
 390              auto p = string_split<int>(argv[i], split_delim);
 391              params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
 392          } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
 393              if (++i >= argc) {
 394                  invalid_param = true;
 395                  break;
 396              }
 397              auto p = string_split<int>(argv[i], split_delim);
 398              params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
 399          } else if (arg == "-rpc" || arg == "--rpc") {
 400              if (++i >= argc) {
 401                  invalid_param = true;
 402                  break;
 403              }
 404              params.rpc_servers.push_back(argv[i]);
 405          } else if (arg == "-sm" || arg == "--split-mode") {
 406              if (++i >= argc) {
 407                  invalid_param = true;
 408                  break;
 409              }
 410              auto p = string_split<std::string>(argv[i], split_delim);
 411              std::vector<llama_split_mode> modes;
 412              for (const auto & m : p) {
 413                  llama_split_mode mode;
 414                  if (m == "none") {
 415                      mode = LLAMA_SPLIT_MODE_NONE;
 416                  } else if (m == "layer") {
 417                      mode = LLAMA_SPLIT_MODE_LAYER;
 418                  } else if (m == "row") {
 419                      mode = LLAMA_SPLIT_MODE_ROW;
 420                  } else {
 421                      invalid_param = true;
 422                      break;
 423                  }
 424                  modes.push_back(mode);
 425              }
 426              params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
 427          } else if (arg == "-mg" || arg == "--main-gpu") {
 428              if (++i >= argc) {
 429                  invalid_param = true;
 430                  break;
 431              }
 432              params.main_gpu = string_split<int>(argv[i], split_delim);
 433          } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
 434              if (++i >= argc) {
 435                  invalid_param = true;
 436                  break;
 437              }
 438              auto p = string_split<bool>(argv[i], split_delim);
 439              params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
 440          } else if (arg == "--numa") {
 441              if (++i >= argc) {
 442                  invalid_param = true;
 443                  break;
 444              } else {
 445                  std::string value(argv[i]);
 446                  /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
 447                  else if (value == "isolate")                    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
 448                  else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
 449                  else { invalid_param = true; break; }
 450              }
 451          } else if (arg == "-fa" || arg == "--flash-attn") {
 452              if (++i >= argc) {
 453                  invalid_param = true;
 454                  break;
 455              }
 456              auto p = string_split<bool>(argv[i], split_delim);
 457              params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
 458          } else if (arg == "-mmp" || arg == "--mmap") {
 459              if (++i >= argc) {
 460                  invalid_param = true;
 461                  break;
 462              }
 463              auto p = string_split<bool>(argv[i], split_delim);
 464              params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
 465          } else if (arg == "-embd" || arg == "--embeddings") {
 466              if (++i >= argc) {
 467                  invalid_param = true;
 468                  break;
 469              }
 470              auto p = string_split<bool>(argv[i], split_delim);
 471              params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
 472          } else if (arg == "-ts" || arg == "--tensor-split") {
 473              if (++i >= argc) {
 474                  invalid_param = true;
 475                  break;
 476              }
 477              for (auto ts : string_split<std::string>(argv[i], split_delim)) {
 478                  // split string by ; and /
 479                  const std::regex regex{R"([;/]+)"};
 480                  std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
 481                  std::vector<std::string> split_arg{it, {}};
 482                  GGML_ASSERT(split_arg.size() <= llama_max_devices());
 483  
 484                  std::vector<float> tensor_split(llama_max_devices());
 485                  for (size_t i = 0; i < llama_max_devices(); ++i) {
 486                      if (i < split_arg.size()) {
 487                          tensor_split[i] = std::stof(split_arg[i]);
 488                      } else {
 489                          tensor_split[i] = 0.0f;
 490                      }
 491                  }
 492                  params.tensor_split.push_back(tensor_split);
 493              }
 494          } else if (arg == "-r" || arg == "--repetitions") {
 495              if (++i >= argc) {
 496                  invalid_param = true;
 497                  break;
 498              }
 499              params.reps = std::stoi(argv[i]);
 500          } else if (arg == "-o" || arg == "--output") {
 501              if (++i >= argc) {
 502                  invalid_param = true;
 503                  break;
 504              }
 505              invalid_param = !output_format_from_str(argv[i], params.output_format);
 506          } else if (arg == "-oe" || arg == "--output-err") {
 507              if (++i >= argc) {
 508                  invalid_param = true;
 509                  break;
 510              }
 511              invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
 512          } else if (arg == "-v" || arg == "--verbose") {
 513              params.verbose = true;
 514          } else {
 515              invalid_param = true;
 516              break;
 517          }
 518      }
 519      if (invalid_param) {
 520          fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
 521          print_usage(argc, argv);
 522          exit(1);
 523      }
 524  
 525      // set defaults
 526      if (params.model.empty())        { params.model = cmd_params_defaults.model; }
 527      if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
 528      if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
 529      if (params.n_pg.empty())         { params.n_pg = cmd_params_defaults.n_pg; }
 530      if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
 531      if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
 532      if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
 533      if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
 534      if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
 535      if (params.rpc_servers.empty())  { params.rpc_servers = cmd_params_defaults.rpc_servers; }
 536      if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
 537      if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
 538      if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
 539      if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
 540      if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
 541      if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
 542      if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
 543      if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
 544  
 545      return params;
 546  }
 547  
// One concrete benchmark run: a single value chosen from each cmd_params list.
// In practice either n_prompt or n_gen is 0 (pp-only / tg-only tests) unless
// the instance came from a -pg pair (see get_cmd_params_instances).
struct cmd_params_instance {
    std::string model;
    int n_prompt;
    int n_gen;
    int n_batch;
    int n_ubatch;
    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    std::string rpc_servers;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;

    // Model-loading parameters derived from this instance, starting from the
    // llama defaults and overriding only the fields this tool varies.
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
        // leave the default rpc_servers untouched when the flag wasn't given
        if (!rpc_servers.empty()) {
            mparams.rpc_servers = rpc_servers.c_str();
        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap = use_mmap;

        return mparams;
    }

    // True when two instances agree on every model-level parameter, so the
    // same loaded model can presumably be reused between them (cf. the
    // reload-minimizing loop order in get_cmd_params_instances). Context-level
    // fields (batch sizes, cache types, flash_attn, ...) are deliberately
    // not compared here.
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
               rpc_servers == other.rpc_servers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
    }

    // Context parameters derived from this instance; the context is sized
    // exactly for the test (n_prompt + n_gen tokens).
    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
        cparams.n_ubatch = n_ubatch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
        cparams.flash_attn = flash_attn;
        cparams.embeddings = embeddings;

        return cparams;
    }
};
 607  
 608  static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
 609      std::vector<cmd_params_instance> instances;
 610  
 611      // this ordering minimizes the number of times that each model needs to be reloaded
 612      for (const auto & m : params.model)
 613      for (const auto & nl : params.n_gpu_layers)
 614      for (const auto & rpc : params.rpc_servers)
 615      for (const auto & sm : params.split_mode)
 616      for (const auto & mg : params.main_gpu)
 617      for (const auto & ts : params.tensor_split)
 618      for (const auto & mmp : params.use_mmap)
 619      for (const auto & embd : params.embeddings)
 620      for (const auto & nb : params.n_batch)
 621      for (const auto & nub : params.n_ubatch)
 622      for (const auto & tk : params.type_k)
 623      for (const auto & tv : params.type_v)
 624      for (const auto & nkvo : params.no_kv_offload)
 625      for (const auto & fa : params.flash_attn)
 626      for (const auto & nt : params.n_threads) {
 627          for (const auto & n_prompt : params.n_prompt) {
 628              if (n_prompt == 0) {
 629                  continue;
 630              }
 631              cmd_params_instance instance = {
 632                  /* .model        = */ m,
 633                  /* .n_prompt     = */ n_prompt,
 634                  /* .n_gen        = */ 0,
 635                  /* .n_batch      = */ nb,
 636                  /* .n_ubatch     = */ nub,
 637                  /* .type_k       = */ tk,
 638                  /* .type_v       = */ tv,
 639                  /* .n_threads    = */ nt,
 640                  /* .n_gpu_layers = */ nl,
 641                  /* .rpc_servers  = */ rpc,
 642                  /* .split_mode   = */ sm,
 643                  /* .main_gpu     = */ mg,
 644                  /* .no_kv_offload= */ nkvo,
 645                  /* .flash_attn   = */ fa,
 646                  /* .tensor_split = */ ts,
 647                  /* .use_mmap     = */ mmp,
 648                  /* .embeddings   = */ embd,
 649              };
 650              instances.push_back(instance);
 651          }
 652  
 653          for (const auto & n_gen : params.n_gen) {
 654              if (n_gen == 0) {
 655                  continue;
 656              }
 657              cmd_params_instance instance = {
 658                  /* .model        = */ m,
 659                  /* .n_prompt     = */ 0,
 660                  /* .n_gen        = */ n_gen,
 661                  /* .n_batch      = */ nb,
 662                  /* .n_ubatch     = */ nub,
 663                  /* .type_k       = */ tk,
 664                  /* .type_v       = */ tv,
 665                  /* .n_threads    = */ nt,
 666                  /* .n_gpu_layers = */ nl,
 667                  /* .rpc_servers  = */ rpc,
 668                  /* .split_mode   = */ sm,
 669                  /* .main_gpu     = */ mg,
 670                  /* .no_kv_offload= */ nkvo,
 671                  /* .flash_attn   = */ fa,
 672                  /* .tensor_split = */ ts,
 673                  /* .use_mmap     = */ mmp,
 674                  /* .embeddings   = */ embd,
 675              };
 676              instances.push_back(instance);
 677          }
 678  
 679          for (const auto & n_pg : params.n_pg) {
 680              if (n_pg.first == 0 && n_pg.second == 0) {
 681                  continue;
 682              }
 683              cmd_params_instance instance = {
 684                  /* .model        = */ m,
 685                  /* .n_prompt     = */ n_pg.first,
 686                  /* .n_gen        = */ n_pg.second,
 687                  /* .n_batch      = */ nb,
 688                  /* .n_ubatch     = */ nub,
 689                  /* .type_k       = */ tk,
 690                  /* .type_v       = */ tv,
 691                  /* .n_threads    = */ nt,
 692                  /* .n_gpu_layers = */ nl,
 693                  /* .rpc_servers  = */ rpc,
 694                  /* .split_mode   = */ sm,
 695                  /* .main_gpu     = */ mg,
 696                  /* .no_kv_offload= */ nkvo,
 697                  /* .flash_attn   = */ fa,
 698                  /* .tensor_split = */ ts,
 699                  /* .use_mmap     = */ mmp,
 700                  /* .embeddings   = */ embd,
 701              };
 702              instances.push_back(instance);
 703          }
 704      }
 705  
 706      return instances;
 707  }
 708  
 709  struct test {
 710      static const std::string build_commit;
 711      static const int build_number;
 712      static const bool cuda;
 713      static const bool vulkan;
 714      static const bool kompute;
 715      static const bool metal;
 716      static const bool sycl;
 717      static const bool gpu_blas;
 718      static const bool blas;
 719      static const std::string cpu_info;
 720      static const std::string gpu_info;
 721      std::string model_filename;
 722      std::string model_type;
 723      uint64_t model_size;
 724      uint64_t model_n_params;
 725      int n_batch;
 726      int n_ubatch;
 727      int n_threads;
 728      bool has_rpc;
 729      ggml_type type_k;
 730      ggml_type type_v;
 731      int n_gpu_layers;
 732      llama_split_mode split_mode;
 733      int main_gpu;
 734      bool no_kv_offload;
 735      bool flash_attn;
 736      std::vector<float> tensor_split;
 737      bool use_mmap;
 738      bool embeddings;
 739      int n_prompt;
 740      int n_gen;
 741      std::string test_time;
 742      std::vector<uint64_t> samples_ns;
 743  
 744      test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
 745          model_filename = inst.model;
 746          char buf[128];
 747          llama_model_desc(lmodel, buf, sizeof(buf));
 748          model_type = buf;
 749          model_size = llama_model_size(lmodel);
 750          model_n_params = llama_model_n_params(lmodel);
 751          n_batch = inst.n_batch;
 752          n_ubatch = inst.n_ubatch;
 753          n_threads = inst.n_threads;
 754          has_rpc = !inst.rpc_servers.empty();
 755          type_k = inst.type_k;
 756          type_v = inst.type_v;
 757          n_gpu_layers = inst.n_gpu_layers;
 758          split_mode = inst.split_mode;
 759          main_gpu = inst.main_gpu;
 760          no_kv_offload = inst.no_kv_offload;
 761          flash_attn = inst.flash_attn;
 762          tensor_split = inst.tensor_split;
 763          use_mmap = inst.use_mmap;
 764          embeddings = inst.embeddings;
 765          n_prompt = inst.n_prompt;
 766          n_gen = inst.n_gen;
 767          // RFC 3339 date-time format
 768          time_t t = time(NULL);
 769          std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
 770          test_time = buf;
 771  
 772          (void) ctx;
 773      }
 774  
 775      uint64_t avg_ns() const {
 776          return ::avg(samples_ns);
 777      }
 778  
 779      uint64_t stdev_ns() const {
 780          return ::stdev(samples_ns);
 781      }
 782  
 783      std::vector<double> get_ts() const {
 784          int n_tokens = n_prompt + n_gen;
 785          std::vector<double> ts;
 786          std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
 787          return ts;
 788      }
 789  
 790      double avg_ts() const {
 791          return ::avg(get_ts());
 792      }
 793  
 794      double stdev_ts() const {
 795          return ::stdev(get_ts());
 796      }
 797  
 798      static std::string get_backend() {
 799          if (cuda) {
 800              return GGML_CUDA_NAME;
 801          }
 802          if (vulkan) {
 803              return "Vulkan";
 804          }
 805          if (kompute) {
 806              return "Kompute";
 807          }
 808          if (metal) {
 809              return "Metal";
 810          }
 811          if (sycl) {
 812              return GGML_SYCL_NAME;
 813          }
 814          if (gpu_blas) {
 815              return "GPU BLAS";
 816          }
 817          if (blas) {
 818              return "BLAS";
 819          }
 820  
 821          return "CPU";
 822      }
 823  
 824      static const std::vector<std::string> & get_fields() {
 825          static const std::vector<std::string> fields = {
 826              "build_commit", "build_number",
 827              "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
 828              "cpu_info", "gpu_info",
 829              "model_filename", "model_type", "model_size", "model_n_params",
 830              "n_batch", "n_ubatch",
 831              "n_threads", "type_k", "type_v",
 832              "n_gpu_layers", "split_mode",
 833              "main_gpu", "no_kv_offload", "flash_attn",
 834              "tensor_split", "use_mmap", "embeddings",
 835              "n_prompt", "n_gen", "test_time",
 836              "avg_ns", "stddev_ns",
 837              "avg_ts", "stddev_ts"
 838          };
 839          return fields;
 840      }
 841  
 842      enum field_type {STRING, BOOL, INT, FLOAT};
 843  
 844      static field_type get_field_type(const std::string & field) {
 845          if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
 846              field == "n_threads" ||
 847              field == "model_size" || field == "model_n_params" ||
 848              field == "n_gpu_layers" || field == "main_gpu" ||
 849              field == "n_prompt" || field == "n_gen" ||
 850              field == "avg_ns" || field == "stddev_ns") {
 851              return INT;
 852          }
 853          if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
 854              field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
 855              field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
 856              return BOOL;
 857          }
 858          if (field == "avg_ts" || field == "stddev_ts") {
 859              return FLOAT;
 860          }
 861          return STRING;
 862      }
 863  
 864      std::vector<std::string> get_values() const {
 865          std::string tensor_split_str;
 866          int max_nonzero = 0;
 867          for (size_t i = 0; i < llama_max_devices(); i++) {
 868              if (tensor_split[i] > 0) {
 869                  max_nonzero = i;
 870              }
 871          }
 872          for (int i = 0; i <= max_nonzero; i++) {
 873              char buf[32];
 874              snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
 875              tensor_split_str += buf;
 876              if (i < max_nonzero) {
 877                  tensor_split_str += "/";
 878              }
 879          }
 880          std::vector<std::string> values = {
 881              build_commit, std::to_string(build_number),
 882              std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
 883              std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
 884              cpu_info, gpu_info,
 885              model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
 886              std::to_string(n_batch), std::to_string(n_ubatch),
 887              std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
 888              std::to_string(n_gpu_layers), split_mode_str(split_mode),
 889              std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
 890              tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
 891              std::to_string(n_prompt), std::to_string(n_gen), test_time,
 892              std::to_string(avg_ns()), std::to_string(stdev_ns()),
 893              std::to_string(avg_ts()), std::to_string(stdev_ts())
 894          };
 895          return values;
 896      }
 897  
 898      std::map<std::string, std::string> get_map() const {
 899          std::map<std::string, std::string> map;
 900          auto fields = get_fields();
 901          auto values = get_values();
 902          std::transform(fields.begin(), fields.end(), values.begin(),
 903                  std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
 904          return map;
 905      }
 906  };
 907  
// Definitions of test's static build/environment constants.
// These are evaluated once at program startup, before main() runs.
const std::string test::build_commit = LLAMA_COMMIT;
const int         test::build_number = LLAMA_BUILD_NUMBER;
const bool        test::cuda         = !!ggml_cpu_has_cuda();
const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
const bool        test::kompute      = !!ggml_cpu_has_kompute();
const bool        test::metal        = !!ggml_cpu_has_metal();
const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
const bool        test::blas         = !!ggml_cpu_has_blas();
const bool        test::sycl         = !!ggml_cpu_has_sycl();
const std::string test::cpu_info     = get_cpu_info();
const std::string test::gpu_info     = get_gpu_info();
 919  
 920  struct printer {
 921      virtual ~printer() {}
 922  
 923      FILE * fout;
 924      virtual void print_header(const cmd_params & params) { (void) params; }
 925      virtual void print_test(const test & t) = 0;
 926      virtual void print_footer() { }
 927  };
 928  
 929  struct csv_printer : public printer {
 930      static std::string escape_csv(const std::string & field) {
 931          std::string escaped = "\"";
 932          for (auto c : field) {
 933              if (c == '"') {
 934                  escaped += "\"";
 935              }
 936              escaped += c;
 937          }
 938          escaped += "\"";
 939          return escaped;
 940      }
 941  
 942      void print_header(const cmd_params & params) override  {
 943          std::vector<std::string> fields = test::get_fields();
 944          fprintf(fout, "%s\n", join(fields, ",").c_str());
 945          (void) params;
 946      }
 947  
 948      void print_test(const test & t) override {
 949          std::vector<std::string> values = t.get_values();
 950          std::transform(values.begin(), values.end(), values.begin(), escape_csv);
 951          fprintf(fout, "%s\n", join(values, ",").c_str());
 952      }
 953  };
 954  
 955  struct json_printer : public printer {
 956      bool first = true;
 957  
 958      static std::string escape_json(const std::string & value) {
 959          std::string escaped;
 960          for (auto c : value) {
 961              if (c == '"') {
 962                  escaped += "\\\"";
 963              } else if (c == '\\') {
 964                  escaped += "\\\\";
 965              } else  if (c <= 0x1f) {
 966                  char buf[8];
 967                  snprintf(buf, sizeof(buf), "\\u%04x", c);
 968                  escaped += buf;
 969              } else {
 970                  escaped += c;
 971              }
 972          }
 973          return escaped;
 974      }
 975  
 976      static std::string format_value(const std::string & field, const std::string & value) {
 977          switch (test::get_field_type(field)) {
 978              case test::STRING:
 979                  return "\"" + escape_json(value) + "\"";
 980              case test::BOOL:
 981                  return value == "0" ? "false" : "true";
 982              default:
 983                  return value;
 984          }
 985      }
 986  
 987      void print_header(const cmd_params & params) override {
 988          fprintf(fout, "[\n");
 989          (void) params;
 990      }
 991  
 992      void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
 993          assert(fields.size() == values.size());
 994          for (size_t i = 0; i < fields.size(); i++) {
 995              fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
 996          }
 997      }
 998  
 999      void print_test(const test & t) override {
1000          if (first) {
1001              first = false;
1002          } else {
1003              fprintf(fout, ",\n");
1004          }
1005          fprintf(fout, "  {\n");
1006          print_fields(test::get_fields(), t.get_values());
1007          fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
1008          fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
1009          fprintf(fout, "  }");
1010          fflush(fout);
1011      }
1012  
1013      void print_footer() override {
1014          fprintf(fout, "\n]\n");
1015      }
1016  };
1017  
// Prints results as a GitHub-flavored markdown table. Only a subset of the
// fields is shown: always model/size/params/backend/test/t-s, plus any
// parameter the user varied from its default.
struct markdown_printer : public printer {
    // columns selected in print_header(), reused by print_test()
    std::vector<std::string> fields;

    // Column width for a field; a negative value means left-aligned.
    static int get_field_width(const std::string & field) {
        if (field == "model") {
            return -30;
        }
        if (field == "t/s") {
            return 16;
        }
        if (field == "size" || field == "params") {
            return 10;
        }
        if (field == "n_gpu_layers") {
            return 3;
        }
        if (field == "n_threads") {
            return 7;
        }
        if (field == "n_batch") {
            return 7;
        }
        if (field == "n_ubatch") {
            return 8;
        }
        if (field == "type_k" || field == "type_v") {
            return 6;
        }
        if (field == "split_mode") {
            return 5;
        }
        if (field == "flash_attn") {
            return 2;
        }
        if (field == "use_mmap") {
            return 4;
        }
        if (field == "test") {
            return 13;
        }

        // default: at least 10 characters, or the header's own length
        int width = std::max((int)field.length(), 10);

        if (test::get_field_type(field) == test::STRING) {
            return -width;
        }
        return width;
    }

    // Short header label shown in the table for a field.
    static std::string get_field_display_name(const std::string & field) {
        if (field == "n_gpu_layers") {
            return "ngl";
        }
        if (field == "split_mode") {
            return "sm";
        }
        if (field == "n_threads") {
            return "threads";
        }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
        if (field == "flash_attn") {
            return "fa";
        }
        if (field == "use_mmap") {
            return "mmap";
        }
        if (field == "embeddings") {
            return "embd";
        }
        if (field == "tensor_split") {
            return "ts";
        }
        return field;
    }

    void print_header(const cmd_params & params) override {
        // select fields to print
        fields.emplace_back("model");
        fields.emplace_back("size");
        fields.emplace_back("params");
        fields.emplace_back("backend");
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
            fields.emplace_back("n_gpu_layers");
        }
        // a parameter column is shown only when the user varied it from the default
        // (n_threads is also shown whenever the backend runs on the CPU)
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
            fields.emplace_back("n_ubatch");
        }
        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
            fields.emplace_back("type_k");
        }
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
            fields.emplace_back("flash_attn");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");

        // header row
        fprintf(fout, "|");
        for (const auto & field : fields) {
            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
        // separator row; ":" on the right marks a right-aligned column
        fprintf(fout, "|");
        for (const auto & field : fields) {
            int width = get_field_width(field);
            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
        }
        fprintf(fout, "\n");
    }

    void print_test(const test & t) override {
        std::map<std::string, std::string> vmap = t.get_map();

        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            char buf[128];
            // fields with special formatting are handled explicitly;
            // everything else falls through to the name->value map
            if (field == "model") {
                value = t.model_type;
            } else if (field == "size") {
                if (t.model_size < 1024*1024*1024) {
                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                }
                value = buf;
            } else if (field == "params") {
                if (t.model_n_params < 1000*1000*1000) {
                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
                }
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
                if (t.has_rpc) {
                    value += "+RPC";
                }
            } else if (field == "test") {
                // e.g. "pp512", "tg128", or "pp512+tg128" for combined runs
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                value = buf;
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {
                assert(false); // field list and value map out of sync
                exit(1);
            }

            int width = get_field_width(field);
            if (field == "t/s") {
                // HACK: the utf-8 character is 2 bytes
                width += 1;
            }
            fprintf(fout, " %*s |", width, value.c_str());
        }
        fprintf(fout, "\n");
    }

    void print_footer() override {
        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
    }
};
1218  
1219  struct sql_printer : public printer {
1220      static std::string get_sql_field_type(const std::string & field) {
1221          switch (test::get_field_type(field)) {
1222              case test::STRING:
1223                  return "TEXT";
1224              case test::BOOL:
1225              case test::INT:
1226                  return "INTEGER";
1227              case test::FLOAT:
1228                  return "REAL";
1229              default:
1230                  assert(false);
1231                  exit(1);
1232          }
1233      }
1234  
1235      void print_header(const cmd_params & params) override {
1236          std::vector<std::string> fields = test::get_fields();
1237          fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
1238          for (size_t i = 0; i < fields.size(); i++) {
1239              fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),  i < fields.size() - 1 ? "," : "");
1240          }
1241          fprintf(fout, ");\n");
1242          fprintf(fout, "\n");
1243          (void) params;
1244      }
1245  
1246      void print_test(const test & t) override {
1247          fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
1248          fprintf(fout, "VALUES (");
1249          std::vector<std::string> values = t.get_values();
1250          for (size_t i = 0; i < values.size(); i++) {
1251              fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
1252          }
1253          fprintf(fout, ");\n");
1254      }
1255  };
1256  
1257  static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
1258      llama_set_n_threads(ctx, n_threads, n_threads);
1259  
1260      const llama_model * model = llama_get_model(ctx);
1261      const int32_t n_vocab = llama_n_vocab(model);
1262  
1263      std::vector<llama_token> tokens(n_batch);
1264  
1265      int n_processed = 0;
1266  
1267      while (n_processed < n_prompt) {
1268          int n_tokens = std::min(n_prompt - n_processed, n_batch);
1269          tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
1270          for (int i = 1; i < n_tokens; i++) {
1271              tokens[i] = std::rand() % n_vocab;
1272          }
1273          llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
1274          n_processed += n_tokens;
1275      }
1276  
1277      llama_synchronize(ctx);
1278  }
1279  
1280  static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
1281      llama_set_n_threads(ctx, n_threads, n_threads);
1282  
1283      const llama_model * model = llama_get_model(ctx);
1284      const int32_t n_vocab = llama_n_vocab(model);
1285  
1286      llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
1287  
1288      for (int i = 0; i < n_gen; i++) {
1289          llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
1290          llama_synchronize(ctx);
1291          token = std::rand() % n_vocab;
1292      }
1293  }
1294  
1295  static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
1296      (void) level;
1297      (void) text;
1298      (void) user_data;
1299  }
1300  
1301  static std::unique_ptr<printer> create_printer(output_formats format) {
1302      switch (format) {
1303          case NONE:
1304              return nullptr;
1305          case CSV:
1306              return std::unique_ptr<printer>(new csv_printer());
1307          case JSON:
1308              return std::unique_ptr<printer>(new json_printer());
1309          case MARKDOWN:
1310              return std::unique_ptr<printer>(new markdown_printer());
1311          case SQL:
1312              return std::unique_ptr<printer>(new sql_printer());
1313      }
1314      GGML_ASSERT(false);
1315  }
1316  
// Entry point: parse parameters, run every requested test instance, and emit
// results through the configured stdout/stderr printers.
int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");

#if !defined(NDEBUG)
    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif

#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
    fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif

#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif

    cmd_params params = parse_cmd_params(argc, argv);

    // initialize llama.cpp
    if (!params.verbose) {
        // silence library logging unless --verbose was given
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize printer
    std::unique_ptr<printer> p = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);

    if (p) {
        p->fout = stdout;
        p->print_header(params);
    }

    if (p_err) {
        p_err->fout = stderr;
        p_err->print_header(params);
    }

    // expand the parameter combinations into individual test instances
    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

    for (const auto & inst : params_instances) {
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                llama_free_model(lmodel);
            }

            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
            }
            prev_inst = &inst;
        }

        // a fresh context per test instance (context params may differ)
        llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
            llama_free_model(lmodel);
            return 1;
        }

        test t(inst, lmodel, ctx);

        llama_kv_cache_clear(ctx);

        // warmup run
        if (t.n_prompt > 0) {
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }
        if (t.n_gen > 0) {
            test_gen(ctx, 1, 0, t.n_threads);
        }

        // timed repetitions; each one starts from a cleared KV cache
        for (int i = 0; i < params.reps; i++) {
            llama_kv_cache_clear(ctx);

            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
            if (t.n_gen > 0) {
                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
            }

            uint64_t t_ns = get_time_ns() - t_start;
            t.samples_ns.push_back(t_ns);
        }

        // emit this test's results immediately so partial output survives aborts
        if (p) {
            p->print_test(t);
            fflush(p->fout);
        }

        if (p_err) {
            p_err->print_test(t);
            fflush(p_err->fout);
        }

        llama_print_timings(ctx);

        llama_free(ctx);
    }

    llama_free_model(lmodel);

    if (p) {
        p->print_footer();
    }

    if (p_err) {
        p_err->print_footer();
    }

    llama_backend_free();

    return 0;
}