// llama-bench.cpp
1 #include <algorithm> 2 #include <array> 3 #include <cassert> 4 #include <chrono> 5 #include <cinttypes> 6 #include <clocale> 7 #include <cmath> 8 #include <cstdio> 9 #include <cstring> 10 #include <ctime> 11 #include <cstdlib> 12 #include <iterator> 13 #include <map> 14 #include <numeric> 15 #include <regex> 16 #include <sstream> 17 #include <string> 18 #include <vector> 19 20 #include "ggml.h" 21 #include "llama.h" 22 #include "common.h" 23 #include "ggml-cuda.h" 24 #include "ggml-sycl.h" 25 26 // utils 27 static uint64_t get_time_ns() { 28 using clock = std::chrono::high_resolution_clock; 29 return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); 30 } 31 32 template<class T> 33 static std::string join(const std::vector<T> & values, const std::string & delim) { 34 std::ostringstream str; 35 for (size_t i = 0; i < values.size(); i++) { 36 str << values[i]; 37 if (i < values.size() - 1) { 38 str << delim; 39 } 40 } 41 return str.str(); 42 } 43 44 template<typename T, typename F> 45 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) { 46 std::vector<std::string> str_values; 47 std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); 48 return str_values; 49 } 50 51 template<typename T> 52 static T avg(const std::vector<T> & v) { 53 if (v.empty()) { 54 return 0; 55 } 56 T sum = std::accumulate(v.begin(), v.end(), T(0)); 57 return sum / (T)v.size(); 58 } 59 60 template<typename T> 61 static T stdev(const std::vector<T> & v) { 62 if (v.size() <= 1) { 63 return 0; 64 } 65 T mean = avg(v); 66 T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); 67 T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1)); 68 return stdev; 69 } 70 71 static std::string get_cpu_info() { 72 std::string id; 73 #ifdef __linux__ 74 FILE * f = fopen("/proc/cpuinfo", "r"); 75 if (f) { 76 char buf[1024]; 77 while (fgets(buf, sizeof(buf), f)) { 78 if (strncmp(buf, 
"model name", 10) == 0) { 79 char * p = strchr(buf, ':'); 80 if (p) { 81 p++; 82 while (std::isspace(*p)) { 83 p++; 84 } 85 while (std::isspace(p[strlen(p) - 1])) { 86 p[strlen(p) - 1] = '\0'; 87 } 88 id = p; 89 break; 90 } 91 } 92 } 93 fclose(f); 94 } 95 #endif 96 // TODO: other platforms 97 return id; 98 } 99 100 static std::string get_gpu_info() { 101 std::string id; 102 #ifdef GGML_USE_CUDA 103 int count = ggml_backend_cuda_get_device_count(); 104 for (int i = 0; i < count; i++) { 105 char buf[128]; 106 ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); 107 id += buf; 108 if (i < count - 1) { 109 id += "/"; 110 } 111 } 112 #endif 113 #ifdef GGML_USE_SYCL 114 int count = ggml_backend_sycl_get_device_count(); 115 for (int i = 0; i < count; i++) { 116 char buf[128]; 117 ggml_sycl_get_device_description(i, buf, sizeof(buf)); 118 id += buf; 119 if (i < count - 1) { 120 id += "/"; 121 } 122 } 123 #endif 124 // TODO: other backends 125 return id; 126 } 127 128 // command line params 129 enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; 130 131 static const char * output_format_str(output_formats format) { 132 switch (format) { 133 case NONE: return "none"; 134 case CSV: return "csv"; 135 case JSON: return "json"; 136 case MARKDOWN: return "md"; 137 case SQL: return "sql"; 138 default: GGML_ASSERT(!"invalid output format"); 139 } 140 } 141 142 static bool output_format_from_str(const std::string & s, output_formats & format) { 143 if (s == "none") { 144 format = NONE; 145 } else if (s == "csv") { 146 format = CSV; 147 } else if (s == "json") { 148 format = JSON; 149 } else if (s == "md") { 150 format = MARKDOWN; 151 } else if (s == "sql") { 152 format = SQL; 153 } else { 154 return false; 155 } 156 return true; 157 } 158 159 static const char * split_mode_str(llama_split_mode mode) { 160 switch (mode) { 161 case LLAMA_SPLIT_MODE_NONE: return "none"; 162 case LLAMA_SPLIT_MODE_LAYER: return "layer"; 163 case LLAMA_SPLIT_MODE_ROW: return "row"; 164 
default: GGML_ASSERT(!"invalid split mode"); 165 } 166 } 167 168 static std::string pair_str(const std::pair<int, int> & p) { 169 static char buf[32]; 170 snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); 171 return buf; 172 } 173 174 struct cmd_params { 175 std::vector<std::string> model; 176 std::vector<int> n_prompt; 177 std::vector<int> n_gen; 178 std::vector<std::pair<int, int>> n_pg; 179 std::vector<int> n_batch; 180 std::vector<int> n_ubatch; 181 std::vector<ggml_type> type_k; 182 std::vector<ggml_type> type_v; 183 std::vector<int> n_threads; 184 std::vector<int> n_gpu_layers; 185 std::vector<std::string> rpc_servers; 186 std::vector<llama_split_mode> split_mode; 187 std::vector<int> main_gpu; 188 std::vector<bool> no_kv_offload; 189 std::vector<bool> flash_attn; 190 std::vector<std::vector<float>> tensor_split; 191 std::vector<bool> use_mmap; 192 std::vector<bool> embeddings; 193 ggml_numa_strategy numa; 194 int reps; 195 bool verbose; 196 output_formats output_format; 197 output_formats output_format_stderr; 198 }; 199 200 static const cmd_params cmd_params_defaults = { 201 /* model */ {"models/7B/ggml-model-q4_0.gguf"}, 202 /* n_prompt */ {512}, 203 /* n_gen */ {128}, 204 /* n_pg */ {}, 205 /* n_batch */ {2048}, 206 /* n_ubatch */ {512}, 207 /* type_k */ {GGML_TYPE_F16}, 208 /* type_v */ {GGML_TYPE_F16}, 209 /* n_threads */ {cpu_get_num_math()}, 210 /* n_gpu_layers */ {99}, 211 /* rpc_servers */ {""}, 212 /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, 213 /* main_gpu */ {0}, 214 /* no_kv_offload */ {false}, 215 /* flash_attn */ {false}, 216 /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)}, 217 /* use_mmap */ {true}, 218 /* embeddings */ {false}, 219 /* numa */ GGML_NUMA_STRATEGY_DISABLED, 220 /* reps */ 5, 221 /* verbose */ false, 222 /* output_format */ MARKDOWN, 223 /* output_format_stderr */ NONE, 224 }; 225 226 static void print_usage(int /* argc */, char ** argv) { 227 printf("usage: %s [options]\n", argv[0]); 228 printf("\n"); 
229 printf("options:\n"); 230 printf(" -h, --help\n"); 231 printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); 232 printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); 233 printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); 234 printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); 235 printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); 236 printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); 237 printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); 238 printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); 239 printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); 240 printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); 241 printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); 242 printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); 243 printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); 244 printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); 245 printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); 246 printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); 247 printf(" --numa <distribute|isolate|numactl> (default: disabled)\n"); 248 printf(" -embd, --embeddings <0|1> (default: %s)\n", 
join(cmd_params_defaults.embeddings, ",").c_str()); 249 printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); 250 printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); 251 printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); 252 printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); 253 printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); 254 printf("\n"); 255 printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); 256 } 257 258 static ggml_type ggml_type_from_name(const std::string & s) { 259 if (s == "f16") { 260 return GGML_TYPE_F16; 261 } 262 if (s == "q8_0") { 263 return GGML_TYPE_Q8_0; 264 } 265 if (s == "q4_0") { 266 return GGML_TYPE_Q4_0; 267 } 268 if (s == "q4_1") { 269 return GGML_TYPE_Q4_1; 270 } 271 if (s == "q5_0") { 272 return GGML_TYPE_Q5_0; 273 } 274 if (s == "q5_1") { 275 return GGML_TYPE_Q5_1; 276 } 277 if (s == "iq4_nl") { 278 return GGML_TYPE_IQ4_NL; 279 } 280 281 return GGML_TYPE_COUNT; 282 } 283 284 285 static cmd_params parse_cmd_params(int argc, char ** argv) { 286 cmd_params params; 287 std::string arg; 288 bool invalid_param = false; 289 const std::string arg_prefix = "--"; 290 const char split_delim = ','; 291 292 params.verbose = cmd_params_defaults.verbose; 293 params.output_format = cmd_params_defaults.output_format; 294 params.output_format_stderr = cmd_params_defaults.output_format_stderr; 295 params.reps = cmd_params_defaults.reps; 296 params.numa = cmd_params_defaults.numa; 297 298 for (int i = 1; i < argc; i++) { 299 arg = argv[i]; 300 if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { 301 std::replace(arg.begin(), arg.end(), '_', '-'); 302 } 303 304 if (arg == "-h" || arg == "--help") { 305 print_usage(argc, argv); 306 exit(0); 307 } else if (arg == "-m" || 
arg == "--model") { 308 if (++i >= argc) { 309 invalid_param = true; 310 break; 311 } 312 auto p = string_split<std::string>(argv[i], split_delim); 313 params.model.insert(params.model.end(), p.begin(), p.end()); 314 } else if (arg == "-p" || arg == "--n-prompt") { 315 if (++i >= argc) { 316 invalid_param = true; 317 break; 318 } 319 auto p = string_split<int>(argv[i], split_delim); 320 params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); 321 } else if (arg == "-n" || arg == "--n-gen") { 322 if (++i >= argc) { 323 invalid_param = true; 324 break; 325 } 326 auto p = string_split<int>(argv[i], split_delim); 327 params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); 328 } else if (arg == "-pg") { 329 if (++i >= argc) { 330 invalid_param = true; 331 break; 332 } 333 auto p = string_split<std::string>(argv[i], ','); 334 if (p.size() != 2) { 335 invalid_param = true; 336 break; 337 } 338 params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); 339 } else if (arg == "-b" || arg == "--batch-size") { 340 if (++i >= argc) { 341 invalid_param = true; 342 break; 343 } 344 auto p = string_split<int>(argv[i], split_delim); 345 params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); 346 } else if (arg == "-ub" || arg == "--ubatch-size") { 347 if (++i >= argc) { 348 invalid_param = true; 349 break; 350 } 351 auto p = string_split<int>(argv[i], split_delim); 352 params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); 353 } else if (arg == "-ctk" || arg == "--cache-type-k") { 354 if (++i >= argc) { 355 invalid_param = true; 356 break; 357 } 358 auto p = string_split<std::string>(argv[i], split_delim); 359 std::vector<ggml_type> types; 360 for (const auto & t : p) { 361 ggml_type gt = ggml_type_from_name(t); 362 if (gt == GGML_TYPE_COUNT) { 363 invalid_param = true; 364 break; 365 } 366 types.push_back(gt); 367 } 368 params.type_k.insert(params.type_k.end(), types.begin(), types.end()); 369 } else if (arg == "-ctv" || arg == "--cache-type-v") 
{ 370 if (++i >= argc) { 371 invalid_param = true; 372 break; 373 } 374 auto p = string_split<std::string>(argv[i], split_delim); 375 std::vector<ggml_type> types; 376 for (const auto & t : p) { 377 ggml_type gt = ggml_type_from_name(t); 378 if (gt == GGML_TYPE_COUNT) { 379 invalid_param = true; 380 break; 381 } 382 types.push_back(gt); 383 } 384 params.type_v.insert(params.type_v.end(), types.begin(), types.end()); 385 } else if (arg == "-t" || arg == "--threads") { 386 if (++i >= argc) { 387 invalid_param = true; 388 break; 389 } 390 auto p = string_split<int>(argv[i], split_delim); 391 params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); 392 } else if (arg == "-ngl" || arg == "--n-gpu-layers") { 393 if (++i >= argc) { 394 invalid_param = true; 395 break; 396 } 397 auto p = string_split<int>(argv[i], split_delim); 398 params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); 399 } else if (arg == "-rpc" || arg == "--rpc") { 400 if (++i >= argc) { 401 invalid_param = true; 402 break; 403 } 404 params.rpc_servers.push_back(argv[i]); 405 } else if (arg == "-sm" || arg == "--split-mode") { 406 if (++i >= argc) { 407 invalid_param = true; 408 break; 409 } 410 auto p = string_split<std::string>(argv[i], split_delim); 411 std::vector<llama_split_mode> modes; 412 for (const auto & m : p) { 413 llama_split_mode mode; 414 if (m == "none") { 415 mode = LLAMA_SPLIT_MODE_NONE; 416 } else if (m == "layer") { 417 mode = LLAMA_SPLIT_MODE_LAYER; 418 } else if (m == "row") { 419 mode = LLAMA_SPLIT_MODE_ROW; 420 } else { 421 invalid_param = true; 422 break; 423 } 424 modes.push_back(mode); 425 } 426 params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); 427 } else if (arg == "-mg" || arg == "--main-gpu") { 428 if (++i >= argc) { 429 invalid_param = true; 430 break; 431 } 432 params.main_gpu = string_split<int>(argv[i], split_delim); 433 } else if (arg == "-nkvo" || arg == "--no-kv-offload") { 434 if (++i >= argc) { 435 
invalid_param = true; 436 break; 437 } 438 auto p = string_split<bool>(argv[i], split_delim); 439 params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); 440 } else if (arg == "--numa") { 441 if (++i >= argc) { 442 invalid_param = true; 443 break; 444 } else { 445 std::string value(argv[i]); 446 /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } 447 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } 448 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } 449 else { invalid_param = true; break; } 450 } 451 } else if (arg == "-fa" || arg == "--flash-attn") { 452 if (++i >= argc) { 453 invalid_param = true; 454 break; 455 } 456 auto p = string_split<bool>(argv[i], split_delim); 457 params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); 458 } else if (arg == "-mmp" || arg == "--mmap") { 459 if (++i >= argc) { 460 invalid_param = true; 461 break; 462 } 463 auto p = string_split<bool>(argv[i], split_delim); 464 params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); 465 } else if (arg == "-embd" || arg == "--embeddings") { 466 if (++i >= argc) { 467 invalid_param = true; 468 break; 469 } 470 auto p = string_split<bool>(argv[i], split_delim); 471 params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); 472 } else if (arg == "-ts" || arg == "--tensor-split") { 473 if (++i >= argc) { 474 invalid_param = true; 475 break; 476 } 477 for (auto ts : string_split<std::string>(argv[i], split_delim)) { 478 // split string by ; and / 479 const std::regex regex{R"([;/]+)"}; 480 std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; 481 std::vector<std::string> split_arg{it, {}}; 482 GGML_ASSERT(split_arg.size() <= llama_max_devices()); 483 484 std::vector<float> tensor_split(llama_max_devices()); 485 for (size_t i = 0; i < llama_max_devices(); ++i) { 486 if (i < split_arg.size()) { 487 tensor_split[i] = 
std::stof(split_arg[i]); 488 } else { 489 tensor_split[i] = 0.0f; 490 } 491 } 492 params.tensor_split.push_back(tensor_split); 493 } 494 } else if (arg == "-r" || arg == "--repetitions") { 495 if (++i >= argc) { 496 invalid_param = true; 497 break; 498 } 499 params.reps = std::stoi(argv[i]); 500 } else if (arg == "-o" || arg == "--output") { 501 if (++i >= argc) { 502 invalid_param = true; 503 break; 504 } 505 invalid_param = !output_format_from_str(argv[i], params.output_format); 506 } else if (arg == "-oe" || arg == "--output-err") { 507 if (++i >= argc) { 508 invalid_param = true; 509 break; 510 } 511 invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); 512 } else if (arg == "-v" || arg == "--verbose") { 513 params.verbose = true; 514 } else { 515 invalid_param = true; 516 break; 517 } 518 } 519 if (invalid_param) { 520 fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); 521 print_usage(argc, argv); 522 exit(1); 523 } 524 525 // set defaults 526 if (params.model.empty()) { params.model = cmd_params_defaults.model; } 527 if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } 528 if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } 529 if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } 530 if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } 531 if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } 532 if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } 533 if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } 534 if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } 535 if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } 536 if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } 537 if (params.main_gpu.empty()) { params.main_gpu = 
cmd_params_defaults.main_gpu; } 538 if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } 539 if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } 540 if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } 541 if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } 542 if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } 543 if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } 544 545 return params; 546 } 547 548 struct cmd_params_instance { 549 std::string model; 550 int n_prompt; 551 int n_gen; 552 int n_batch; 553 int n_ubatch; 554 ggml_type type_k; 555 ggml_type type_v; 556 int n_threads; 557 int n_gpu_layers; 558 std::string rpc_servers; 559 llama_split_mode split_mode; 560 int main_gpu; 561 bool no_kv_offload; 562 bool flash_attn; 563 std::vector<float> tensor_split; 564 bool use_mmap; 565 bool embeddings; 566 567 llama_model_params to_llama_mparams() const { 568 llama_model_params mparams = llama_model_default_params(); 569 570 mparams.n_gpu_layers = n_gpu_layers; 571 if (!rpc_servers.empty()) { 572 mparams.rpc_servers = rpc_servers.c_str(); 573 } 574 mparams.split_mode = split_mode; 575 mparams.main_gpu = main_gpu; 576 mparams.tensor_split = tensor_split.data(); 577 mparams.use_mmap = use_mmap; 578 579 return mparams; 580 } 581 582 bool equal_mparams(const cmd_params_instance & other) const { 583 return model == other.model && 584 n_gpu_layers == other.n_gpu_layers && 585 rpc_servers == other.rpc_servers && 586 split_mode == other.split_mode && 587 main_gpu == other.main_gpu && 588 use_mmap == other.use_mmap && 589 tensor_split == other.tensor_split; 590 } 591 592 llama_context_params to_llama_cparams() const { 593 llama_context_params cparams = llama_context_default_params(); 594 595 cparams.n_ctx = n_prompt + n_gen; 596 cparams.n_batch = n_batch; 597 
cparams.n_ubatch = n_ubatch; 598 cparams.type_k = type_k; 599 cparams.type_v = type_v; 600 cparams.offload_kqv = !no_kv_offload; 601 cparams.flash_attn = flash_attn; 602 cparams.embeddings = embeddings; 603 604 return cparams; 605 } 606 }; 607 608 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) { 609 std::vector<cmd_params_instance> instances; 610 611 // this ordering minimizes the number of times that each model needs to be reloaded 612 for (const auto & m : params.model) 613 for (const auto & nl : params.n_gpu_layers) 614 for (const auto & rpc : params.rpc_servers) 615 for (const auto & sm : params.split_mode) 616 for (const auto & mg : params.main_gpu) 617 for (const auto & ts : params.tensor_split) 618 for (const auto & mmp : params.use_mmap) 619 for (const auto & embd : params.embeddings) 620 for (const auto & nb : params.n_batch) 621 for (const auto & nub : params.n_ubatch) 622 for (const auto & tk : params.type_k) 623 for (const auto & tv : params.type_v) 624 for (const auto & nkvo : params.no_kv_offload) 625 for (const auto & fa : params.flash_attn) 626 for (const auto & nt : params.n_threads) { 627 for (const auto & n_prompt : params.n_prompt) { 628 if (n_prompt == 0) { 629 continue; 630 } 631 cmd_params_instance instance = { 632 /* .model = */ m, 633 /* .n_prompt = */ n_prompt, 634 /* .n_gen = */ 0, 635 /* .n_batch = */ nb, 636 /* .n_ubatch = */ nub, 637 /* .type_k = */ tk, 638 /* .type_v = */ tv, 639 /* .n_threads = */ nt, 640 /* .n_gpu_layers = */ nl, 641 /* .rpc_servers = */ rpc, 642 /* .split_mode = */ sm, 643 /* .main_gpu = */ mg, 644 /* .no_kv_offload= */ nkvo, 645 /* .flash_attn = */ fa, 646 /* .tensor_split = */ ts, 647 /* .use_mmap = */ mmp, 648 /* .embeddings = */ embd, 649 }; 650 instances.push_back(instance); 651 } 652 653 for (const auto & n_gen : params.n_gen) { 654 if (n_gen == 0) { 655 continue; 656 } 657 cmd_params_instance instance = { 658 /* .model = */ m, 659 /* .n_prompt = */ 0, 660 /* 
.n_gen = */ n_gen, 661 /* .n_batch = */ nb, 662 /* .n_ubatch = */ nub, 663 /* .type_k = */ tk, 664 /* .type_v = */ tv, 665 /* .n_threads = */ nt, 666 /* .n_gpu_layers = */ nl, 667 /* .rpc_servers = */ rpc, 668 /* .split_mode = */ sm, 669 /* .main_gpu = */ mg, 670 /* .no_kv_offload= */ nkvo, 671 /* .flash_attn = */ fa, 672 /* .tensor_split = */ ts, 673 /* .use_mmap = */ mmp, 674 /* .embeddings = */ embd, 675 }; 676 instances.push_back(instance); 677 } 678 679 for (const auto & n_pg : params.n_pg) { 680 if (n_pg.first == 0 && n_pg.second == 0) { 681 continue; 682 } 683 cmd_params_instance instance = { 684 /* .model = */ m, 685 /* .n_prompt = */ n_pg.first, 686 /* .n_gen = */ n_pg.second, 687 /* .n_batch = */ nb, 688 /* .n_ubatch = */ nub, 689 /* .type_k = */ tk, 690 /* .type_v = */ tv, 691 /* .n_threads = */ nt, 692 /* .n_gpu_layers = */ nl, 693 /* .rpc_servers = */ rpc, 694 /* .split_mode = */ sm, 695 /* .main_gpu = */ mg, 696 /* .no_kv_offload= */ nkvo, 697 /* .flash_attn = */ fa, 698 /* .tensor_split = */ ts, 699 /* .use_mmap = */ mmp, 700 /* .embeddings = */ embd, 701 }; 702 instances.push_back(instance); 703 } 704 } 705 706 return instances; 707 } 708 709 struct test { 710 static const std::string build_commit; 711 static const int build_number; 712 static const bool cuda; 713 static const bool vulkan; 714 static const bool kompute; 715 static const bool metal; 716 static const bool sycl; 717 static const bool gpu_blas; 718 static const bool blas; 719 static const std::string cpu_info; 720 static const std::string gpu_info; 721 std::string model_filename; 722 std::string model_type; 723 uint64_t model_size; 724 uint64_t model_n_params; 725 int n_batch; 726 int n_ubatch; 727 int n_threads; 728 bool has_rpc; 729 ggml_type type_k; 730 ggml_type type_v; 731 int n_gpu_layers; 732 llama_split_mode split_mode; 733 int main_gpu; 734 bool no_kv_offload; 735 bool flash_attn; 736 std::vector<float> tensor_split; 737 bool use_mmap; 738 bool embeddings; 739 int n_prompt; 740 
int n_gen; 741 std::string test_time; 742 std::vector<uint64_t> samples_ns; 743 744 test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { 745 model_filename = inst.model; 746 char buf[128]; 747 llama_model_desc(lmodel, buf, sizeof(buf)); 748 model_type = buf; 749 model_size = llama_model_size(lmodel); 750 model_n_params = llama_model_n_params(lmodel); 751 n_batch = inst.n_batch; 752 n_ubatch = inst.n_ubatch; 753 n_threads = inst.n_threads; 754 has_rpc = !inst.rpc_servers.empty(); 755 type_k = inst.type_k; 756 type_v = inst.type_v; 757 n_gpu_layers = inst.n_gpu_layers; 758 split_mode = inst.split_mode; 759 main_gpu = inst.main_gpu; 760 no_kv_offload = inst.no_kv_offload; 761 flash_attn = inst.flash_attn; 762 tensor_split = inst.tensor_split; 763 use_mmap = inst.use_mmap; 764 embeddings = inst.embeddings; 765 n_prompt = inst.n_prompt; 766 n_gen = inst.n_gen; 767 // RFC 3339 date-time format 768 time_t t = time(NULL); 769 std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); 770 test_time = buf; 771 772 (void) ctx; 773 } 774 775 uint64_t avg_ns() const { 776 return ::avg(samples_ns); 777 } 778 779 uint64_t stdev_ns() const { 780 return ::stdev(samples_ns); 781 } 782 783 std::vector<double> get_ts() const { 784 int n_tokens = n_prompt + n_gen; 785 std::vector<double> ts; 786 std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); 787 return ts; 788 } 789 790 double avg_ts() const { 791 return ::avg(get_ts()); 792 } 793 794 double stdev_ts() const { 795 return ::stdev(get_ts()); 796 } 797 798 static std::string get_backend() { 799 if (cuda) { 800 return GGML_CUDA_NAME; 801 } 802 if (vulkan) { 803 return "Vulkan"; 804 } 805 if (kompute) { 806 return "Kompute"; 807 } 808 if (metal) { 809 return "Metal"; 810 } 811 if (sycl) { 812 return GGML_SYCL_NAME; 813 } 814 if (gpu_blas) { 815 return "GPU BLAS"; 816 } 817 if (blas) { 818 return "BLAS"; 819 } 820 821 
return "CPU"; 822 } 823 824 static const std::vector<std::string> & get_fields() { 825 static const std::vector<std::string> fields = { 826 "build_commit", "build_number", 827 "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", 828 "cpu_info", "gpu_info", 829 "model_filename", "model_type", "model_size", "model_n_params", 830 "n_batch", "n_ubatch", 831 "n_threads", "type_k", "type_v", 832 "n_gpu_layers", "split_mode", 833 "main_gpu", "no_kv_offload", "flash_attn", 834 "tensor_split", "use_mmap", "embeddings", 835 "n_prompt", "n_gen", "test_time", 836 "avg_ns", "stddev_ns", 837 "avg_ts", "stddev_ts" 838 }; 839 return fields; 840 } 841 842 enum field_type {STRING, BOOL, INT, FLOAT}; 843 844 static field_type get_field_type(const std::string & field) { 845 if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || 846 field == "n_threads" || 847 field == "model_size" || field == "model_n_params" || 848 field == "n_gpu_layers" || field == "main_gpu" || 849 field == "n_prompt" || field == "n_gen" || 850 field == "avg_ns" || field == "stddev_ns") { 851 return INT; 852 } 853 if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || 854 field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || 855 field == "flash_attn" || field == "use_mmap" || field == "embeddings") { 856 return BOOL; 857 } 858 if (field == "avg_ts" || field == "stddev_ts") { 859 return FLOAT; 860 } 861 return STRING; 862 } 863 864 std::vector<std::string> get_values() const { 865 std::string tensor_split_str; 866 int max_nonzero = 0; 867 for (size_t i = 0; i < llama_max_devices(); i++) { 868 if (tensor_split[i] > 0) { 869 max_nonzero = i; 870 } 871 } 872 for (int i = 0; i <= max_nonzero; i++) { 873 char buf[32]; 874 snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]); 875 tensor_split_str += buf; 876 if (i < max_nonzero) { 877 tensor_split_str += "/"; 878 } 879 } 880 
std::vector<std::string> values = { 881 build_commit, std::to_string(build_number), 882 std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), 883 std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas), 884 cpu_info, gpu_info, 885 model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), 886 std::to_string(n_batch), std::to_string(n_ubatch), 887 std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), 888 std::to_string(n_gpu_layers), split_mode_str(split_mode), 889 std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), 890 tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), 891 std::to_string(n_prompt), std::to_string(n_gen), test_time, 892 std::to_string(avg_ns()), std::to_string(stdev_ns()), 893 std::to_string(avg_ts()), std::to_string(stdev_ts()) 894 }; 895 return values; 896 } 897 898 std::map<std::string, std::string> get_map() const { 899 std::map<std::string, std::string> map; 900 auto fields = get_fields(); 901 auto values = get_values(); 902 std::transform(fields.begin(), fields.end(), values.begin(), 903 std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>); 904 return map; 905 } 906 }; 907 908 const std::string test::build_commit = LLAMA_COMMIT; 909 const int test::build_number = LLAMA_BUILD_NUMBER; 910 const bool test::cuda = !!ggml_cpu_has_cuda(); 911 const bool test::vulkan = !!ggml_cpu_has_vulkan(); 912 const bool test::kompute = !!ggml_cpu_has_kompute(); 913 const bool test::metal = !!ggml_cpu_has_metal(); 914 const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); 915 const bool test::blas = !!ggml_cpu_has_blas(); 916 const bool test::sycl = !!ggml_cpu_has_sycl(); 917 const std::string test::cpu_info = get_cpu_info(); 918 const std::string test::gpu_info = get_gpu_info(); 919 920 struct printer { 921 virtual ~printer() {} 922 923 FILE * fout; 
924 virtual void print_header(const cmd_params & params) { (void) params; } 925 virtual void print_test(const test & t) = 0; 926 virtual void print_footer() { } 927 }; 928 929 struct csv_printer : public printer { 930 static std::string escape_csv(const std::string & field) { 931 std::string escaped = "\""; 932 for (auto c : field) { 933 if (c == '"') { 934 escaped += "\""; 935 } 936 escaped += c; 937 } 938 escaped += "\""; 939 return escaped; 940 } 941 942 void print_header(const cmd_params & params) override { 943 std::vector<std::string> fields = test::get_fields(); 944 fprintf(fout, "%s\n", join(fields, ",").c_str()); 945 (void) params; 946 } 947 948 void print_test(const test & t) override { 949 std::vector<std::string> values = t.get_values(); 950 std::transform(values.begin(), values.end(), values.begin(), escape_csv); 951 fprintf(fout, "%s\n", join(values, ",").c_str()); 952 } 953 }; 954 955 struct json_printer : public printer { 956 bool first = true; 957 958 static std::string escape_json(const std::string & value) { 959 std::string escaped; 960 for (auto c : value) { 961 if (c == '"') { 962 escaped += "\\\""; 963 } else if (c == '\\') { 964 escaped += "\\\\"; 965 } else if (c <= 0x1f) { 966 char buf[8]; 967 snprintf(buf, sizeof(buf), "\\u%04x", c); 968 escaped += buf; 969 } else { 970 escaped += c; 971 } 972 } 973 return escaped; 974 } 975 976 static std::string format_value(const std::string & field, const std::string & value) { 977 switch (test::get_field_type(field)) { 978 case test::STRING: 979 return "\"" + escape_json(value) + "\""; 980 case test::BOOL: 981 return value == "0" ? 
"false" : "true"; 982 default: 983 return value; 984 } 985 } 986 987 void print_header(const cmd_params & params) override { 988 fprintf(fout, "[\n"); 989 (void) params; 990 } 991 992 void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) { 993 assert(fields.size() == values.size()); 994 for (size_t i = 0; i < fields.size(); i++) { 995 fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str()); 996 } 997 } 998 999 void print_test(const test & t) override { 1000 if (first) { 1001 first = false; 1002 } else { 1003 fprintf(fout, ",\n"); 1004 } 1005 fprintf(fout, " {\n"); 1006 print_fields(test::get_fields(), t.get_values()); 1007 fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); 1008 fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); 1009 fprintf(fout, " }"); 1010 fflush(fout); 1011 } 1012 1013 void print_footer() override { 1014 fprintf(fout, "\n]\n"); 1015 } 1016 }; 1017 1018 struct markdown_printer : public printer { 1019 std::vector<std::string> fields; 1020 1021 static int get_field_width(const std::string & field) { 1022 if (field == "model") { 1023 return -30; 1024 } 1025 if (field == "t/s") { 1026 return 16; 1027 } 1028 if (field == "size" || field == "params") { 1029 return 10; 1030 } 1031 if (field == "n_gpu_layers") { 1032 return 3; 1033 } 1034 if (field == "n_threads") { 1035 return 7; 1036 } 1037 if (field == "n_batch") { 1038 return 7; 1039 } 1040 if (field == "n_ubatch") { 1041 return 8; 1042 } 1043 if (field == "type_k" || field == "type_v") { 1044 return 6; 1045 } 1046 if (field == "split_mode") { 1047 return 5; 1048 } 1049 if (field == "flash_attn") { 1050 return 2; 1051 } 1052 if (field == "use_mmap") { 1053 return 4; 1054 } 1055 if (field == "test") { 1056 return 13; 1057 } 1058 1059 int width = std::max((int)field.length(), 10); 1060 1061 if (test::get_field_type(field) == test::STRING) { 1062 return 
-width; 1063 } 1064 return width; 1065 } 1066 1067 static std::string get_field_display_name(const std::string & field) { 1068 if (field == "n_gpu_layers") { 1069 return "ngl"; 1070 } 1071 if (field == "split_mode") { 1072 return "sm"; 1073 } 1074 if (field == "n_threads") { 1075 return "threads"; 1076 } 1077 if (field == "no_kv_offload") { 1078 return "nkvo"; 1079 } 1080 if (field == "flash_attn") { 1081 return "fa"; 1082 } 1083 if (field == "use_mmap") { 1084 return "mmap"; 1085 } 1086 if (field == "embeddings") { 1087 return "embd"; 1088 } 1089 if (field == "tensor_split") { 1090 return "ts"; 1091 } 1092 return field; 1093 } 1094 1095 void print_header(const cmd_params & params) override { 1096 // select fields to print 1097 fields.emplace_back("model"); 1098 fields.emplace_back("size"); 1099 fields.emplace_back("params"); 1100 fields.emplace_back("backend"); 1101 bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; 1102 if (!is_cpu_backend) { 1103 fields.emplace_back("n_gpu_layers"); 1104 } 1105 if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { 1106 fields.emplace_back("n_threads"); 1107 } 1108 if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { 1109 fields.emplace_back("n_batch"); 1110 } 1111 if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { 1112 fields.emplace_back("n_ubatch"); 1113 } 1114 if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { 1115 fields.emplace_back("type_k"); 1116 } 1117 if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { 1118 fields.emplace_back("type_v"); 1119 } 1120 if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { 1121 fields.emplace_back("main_gpu"); 1122 } 1123 if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { 1124 fields.emplace_back("split_mode"); 1125 } 
1126 if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { 1127 fields.emplace_back("no_kv_offload"); 1128 } 1129 if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) { 1130 fields.emplace_back("flash_attn"); 1131 } 1132 if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { 1133 fields.emplace_back("tensor_split"); 1134 } 1135 if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { 1136 fields.emplace_back("use_mmap"); 1137 } 1138 if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { 1139 fields.emplace_back("embeddings"); 1140 } 1141 fields.emplace_back("test"); 1142 fields.emplace_back("t/s"); 1143 1144 fprintf(fout, "|"); 1145 for (const auto & field : fields) { 1146 fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str()); 1147 } 1148 fprintf(fout, "\n"); 1149 fprintf(fout, "|"); 1150 for (const auto & field : fields) { 1151 int width = get_field_width(field); 1152 fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? 
":" : "-"); 1153 } 1154 fprintf(fout, "\n"); 1155 } 1156 1157 void print_test(const test & t) override { 1158 std::map<std::string, std::string> vmap = t.get_map(); 1159 1160 fprintf(fout, "|"); 1161 for (const auto & field : fields) { 1162 std::string value; 1163 char buf[128]; 1164 if (field == "model") { 1165 value = t.model_type; 1166 } else if (field == "size") { 1167 if (t.model_size < 1024*1024*1024) { 1168 snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); 1169 } else { 1170 snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); 1171 } 1172 value = buf; 1173 } else if (field == "params") { 1174 if (t.model_n_params < 1000*1000*1000) { 1175 snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); 1176 } else { 1177 snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); 1178 } 1179 value = buf; 1180 } else if (field == "backend") { 1181 value = test::get_backend(); 1182 if (t.has_rpc) { 1183 value += "+RPC"; 1184 } 1185 } else if (field == "test") { 1186 if (t.n_prompt > 0 && t.n_gen == 0) { 1187 snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); 1188 } else if (t.n_gen > 0 && t.n_prompt == 0) { 1189 snprintf(buf, sizeof(buf), "tg%d", t.n_gen); 1190 } else { 1191 snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); 1192 } 1193 value = buf; 1194 } else if (field == "t/s") { 1195 snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); 1196 value = buf; 1197 } else if (vmap.find(field) != vmap.end()) { 1198 value = vmap.at(field); 1199 } else { 1200 assert(false); 1201 exit(1); 1202 } 1203 1204 int width = get_field_width(field); 1205 if (field == "t/s") { 1206 // HACK: the utf-8 character is 2 bytes 1207 width += 1; 1208 } 1209 fprintf(fout, " %*s |", width, value.c_str()); 1210 } 1211 fprintf(fout, "\n"); 1212 } 1213 1214 void print_footer() override { 1215 fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number); 1216 } 1217 }; 1218 1219 struct sql_printer 
: public printer { 1220 static std::string get_sql_field_type(const std::string & field) { 1221 switch (test::get_field_type(field)) { 1222 case test::STRING: 1223 return "TEXT"; 1224 case test::BOOL: 1225 case test::INT: 1226 return "INTEGER"; 1227 case test::FLOAT: 1228 return "REAL"; 1229 default: 1230 assert(false); 1231 exit(1); 1232 } 1233 } 1234 1235 void print_header(const cmd_params & params) override { 1236 std::vector<std::string> fields = test::get_fields(); 1237 fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n"); 1238 for (size_t i = 0; i < fields.size(); i++) { 1239 fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), i < fields.size() - 1 ? "," : ""); 1240 } 1241 fprintf(fout, ");\n"); 1242 fprintf(fout, "\n"); 1243 (void) params; 1244 } 1245 1246 void print_test(const test & t) override { 1247 fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str()); 1248 fprintf(fout, "VALUES ("); 1249 std::vector<std::string> values = t.get_values(); 1250 for (size_t i = 0; i < values.size(); i++) { 1251 fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : ""); 1252 } 1253 fprintf(fout, ");\n"); 1254 } 1255 }; 1256 1257 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { 1258 llama_set_n_threads(ctx, n_threads, n_threads); 1259 1260 const llama_model * model = llama_get_model(ctx); 1261 const int32_t n_vocab = llama_n_vocab(model); 1262 1263 std::vector<llama_token> tokens(n_batch); 1264 1265 int n_processed = 0; 1266 1267 while (n_processed < n_prompt) { 1268 int n_tokens = std::min(n_prompt - n_processed, n_batch); 1269 tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? 
llama_token_bos(model) : std::rand() % n_vocab; 1270 for (int i = 1; i < n_tokens; i++) { 1271 tokens[i] = std::rand() % n_vocab; 1272 } 1273 llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); 1274 n_processed += n_tokens; 1275 } 1276 1277 llama_synchronize(ctx); 1278 } 1279 1280 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { 1281 llama_set_n_threads(ctx, n_threads, n_threads); 1282 1283 const llama_model * model = llama_get_model(ctx); 1284 const int32_t n_vocab = llama_n_vocab(model); 1285 1286 llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; 1287 1288 for (int i = 0; i < n_gen; i++) { 1289 llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); 1290 llama_synchronize(ctx); 1291 token = std::rand() % n_vocab; 1292 } 1293 } 1294 1295 static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { 1296 (void) level; 1297 (void) text; 1298 (void) user_data; 1299 } 1300 1301 static std::unique_ptr<printer> create_printer(output_formats format) { 1302 switch (format) { 1303 case NONE: 1304 return nullptr; 1305 case CSV: 1306 return std::unique_ptr<printer>(new csv_printer()); 1307 case JSON: 1308 return std::unique_ptr<printer>(new json_printer()); 1309 case MARKDOWN: 1310 return std::unique_ptr<printer>(new markdown_printer()); 1311 case SQL: 1312 return std::unique_ptr<printer>(new sql_printer()); 1313 } 1314 GGML_ASSERT(false); 1315 } 1316 1317 int main(int argc, char ** argv) { 1318 // try to set locale for unicode characters in markdown 1319 setlocale(LC_CTYPE, ".UTF-8"); 1320 1321 #if !defined(NDEBUG) 1322 fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); 1323 #endif 1324 1325 #if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__)) 1326 fprintf(stderr, "warning: debug build, performance may be affected\n"); 1327 #endif 1328 1329 #if 
defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) 1330 fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); 1331 #endif 1332 1333 cmd_params params = parse_cmd_params(argc, argv); 1334 1335 // initialize llama.cpp 1336 if (!params.verbose) { 1337 llama_log_set(llama_null_log_callback, NULL); 1338 } 1339 llama_backend_init(); 1340 llama_numa_init(params.numa); 1341 1342 // initialize printer 1343 std::unique_ptr<printer> p = create_printer(params.output_format); 1344 std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr); 1345 1346 if (p) { 1347 p->fout = stdout; 1348 p->print_header(params); 1349 } 1350 1351 if (p_err) { 1352 p_err->fout = stderr; 1353 p_err->print_header(params); 1354 } 1355 1356 std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params); 1357 1358 llama_model * lmodel = nullptr; 1359 const cmd_params_instance * prev_inst = nullptr; 1360 1361 for (const auto & inst : params_instances) { 1362 // keep the same model between tests when possible 1363 if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { 1364 if (lmodel) { 1365 llama_free_model(lmodel); 1366 } 1367 1368 lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams()); 1369 if (lmodel == NULL) { 1370 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); 1371 return 1; 1372 } 1373 prev_inst = &inst; 1374 } 1375 1376 llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); 1377 if (ctx == NULL) { 1378 fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); 1379 llama_free_model(lmodel); 1380 return 1; 1381 } 1382 1383 test t(inst, lmodel, ctx); 1384 1385 llama_kv_cache_clear(ctx); 1386 1387 // warmup run 1388 if (t.n_prompt > 0) { 1389 //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); 1390 test_prompt(ctx, t.n_prompt, 0, 
t.n_batch, t.n_threads); 1391 } 1392 if (t.n_gen > 0) { 1393 test_gen(ctx, 1, 0, t.n_threads); 1394 } 1395 1396 for (int i = 0; i < params.reps; i++) { 1397 llama_kv_cache_clear(ctx); 1398 1399 uint64_t t_start = get_time_ns(); 1400 1401 if (t.n_prompt > 0) { 1402 test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); 1403 } 1404 if (t.n_gen > 0) { 1405 test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); 1406 } 1407 1408 uint64_t t_ns = get_time_ns() - t_start; 1409 t.samples_ns.push_back(t_ns); 1410 } 1411 1412 if (p) { 1413 p->print_test(t); 1414 fflush(p->fout); 1415 } 1416 1417 if (p_err) { 1418 p_err->print_test(t); 1419 fflush(p_err->fout); 1420 } 1421 1422 llama_print_timings(ctx); 1423 1424 llama_free(ctx); 1425 } 1426 1427 llama_free_model(lmodel); 1428 1429 if (p) { 1430 p->print_footer(); 1431 } 1432 1433 if (p_err) { 1434 p_err->print_footer(); 1435 } 1436 1437 llama_backend_free(); 1438 1439 return 0; 1440 }