common.cpp
1 #include "common.h" 2 // Change JSON_ASSERT from assert() to GGML_ASSERT: 3 #define JSON_ASSERT GGML_ASSERT 4 #include "json.hpp" 5 #include "json-schema-to-grammar.h" 6 #include "llama.h" 7 8 #include <algorithm> 9 #include <cassert> 10 #include <cinttypes> 11 #include <cmath> 12 #include <codecvt> 13 #include <cstdarg> 14 #include <cstring> 15 #include <ctime> 16 #include <fstream> 17 #include <iostream> 18 #include <iterator> 19 #include <regex> 20 #include <sstream> 21 #include <string> 22 #include <unordered_map> 23 #include <unordered_set> 24 #include <vector> 25 26 #if defined(__APPLE__) && defined(__MACH__) 27 #include <sys/types.h> 28 #include <sys/sysctl.h> 29 #endif 30 31 #if defined(_WIN32) 32 #define WIN32_LEAN_AND_MEAN 33 #ifndef NOMINMAX 34 # define NOMINMAX 35 #endif 36 #include <locale> 37 #include <windows.h> 38 #include <fcntl.h> 39 #include <io.h> 40 #else 41 #include <sys/ioctl.h> 42 #include <sys/stat.h> 43 #include <unistd.h> 44 #endif 45 #if defined(LLAMA_USE_CURL) 46 #include <curl/curl.h> 47 #include <curl/easy.h> 48 #include <thread> 49 #include <future> 50 #endif 51 52 #if defined(_MSC_VER) 53 #pragma warning(disable: 4244 4267) // possible loss of data 54 #endif 55 56 #if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) 57 #define GGML_USE_CUDA_SYCL 58 #endif 59 60 #if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) 61 #define GGML_USE_CUDA_SYCL_VULKAN 62 #endif 63 64 #if defined(LLAMA_USE_CURL) 65 #ifdef __linux__ 66 #include <linux/limits.h> 67 #elif defined(_WIN32) 68 #define PATH_MAX MAX_PATH 69 #else 70 #include <sys/syslimits.h> 71 #endif 72 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 73 #endif // LLAMA_USE_CURL 74 75 using json = nlohmann::ordered_json; 76 77 // 78 // CPU utils 79 // 80 81 int32_t cpu_get_num_physical_cores() { 82 #ifdef __linux__ 83 // enumerate the set of thread siblings, num entries is num cores 84 std::unordered_set<std::string> siblings; 85 
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { 86 std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" 87 + std::to_string(cpu) + "/topology/thread_siblings"); 88 if (!thread_siblings.is_open()) { 89 break; // no more cpus 90 } 91 std::string line; 92 if (std::getline(thread_siblings, line)) { 93 siblings.insert(line); 94 } 95 } 96 if (!siblings.empty()) { 97 return static_cast<int32_t>(siblings.size()); 98 } 99 #elif defined(__APPLE__) && defined(__MACH__) 100 int32_t num_physical_cores; 101 size_t len = sizeof(num_physical_cores); 102 int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); 103 if (result == 0) { 104 return num_physical_cores; 105 } 106 result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); 107 if (result == 0) { 108 return num_physical_cores; 109 } 110 #elif defined(_WIN32) 111 //TODO: Implement 112 #endif 113 unsigned int n_threads = std::thread::hardware_concurrency(); 114 return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; 115 } 116 117 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) 118 #include <pthread.h> 119 120 static void cpuid(unsigned leaf, unsigned subleaf, 121 unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { 122 __asm__("movq\t%%rbx,%%rsi\n\t" 123 "cpuid\n\t" 124 "xchgq\t%%rbx,%%rsi" 125 : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) 126 : "0"(leaf), "2"(subleaf)); 127 } 128 129 static int pin_cpu(int cpu) { 130 cpu_set_t mask; 131 CPU_ZERO(&mask); 132 CPU_SET(cpu, &mask); 133 return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); 134 } 135 136 static bool is_hybrid_cpu(void) { 137 unsigned eax, ebx, ecx, edx; 138 cpuid(7, 0, &eax, &ebx, &ecx, &edx); 139 return !!(edx & (1u << 15)); 140 } 141 142 static bool is_running_on_efficiency_core(void) { 143 unsigned eax, ebx, ecx, edx; 144 cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); 145 int intel_atom = 0x20; 146 int core_type = (eax & 0xff000000u) >> 24; 147 return core_type == intel_atom; 148 } 149 150 static int cpu_count_math_cpus(int n_cpu) { 151 int result = 0; 152 for (int cpu = 0; cpu < n_cpu; ++cpu) { 153 if (pin_cpu(cpu)) { 154 return -1; 155 } 156 if (is_running_on_efficiency_core()) { 157 continue; // efficiency cores harm lockstep threading 158 } 159 ++cpu; // hyperthreading isn't useful for linear algebra 160 ++result; 161 } 162 return result; 163 } 164 165 #endif // __x86_64__ && __linux__ 166 167 /** 168 * Returns number of CPUs on system that are useful for math. 
169 */ 170 int32_t cpu_get_num_math() { 171 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) 172 int n_cpu = sysconf(_SC_NPROCESSORS_ONLN); 173 if (n_cpu < 1) { 174 return cpu_get_num_physical_cores(); 175 } 176 if (is_hybrid_cpu()) { 177 cpu_set_t affinity; 178 if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { 179 int result = cpu_count_math_cpus(n_cpu); 180 pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); 181 if (result > 0) { 182 return result; 183 } 184 } 185 } 186 #endif 187 return cpu_get_num_physical_cores(); 188 } 189 190 // 191 // CLI argument parsing 192 // 193 194 void gpt_params_handle_model_default(gpt_params & params) { 195 if (!params.hf_repo.empty()) { 196 // short-hand to avoid specifying --hf-file -> default it to --model 197 if (params.hf_file.empty()) { 198 if (params.model.empty()) { 199 throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); 200 } 201 params.hf_file = params.model; 202 } else if (params.model.empty()) { 203 params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); 204 } 205 } else if (!params.model_url.empty()) { 206 if (params.model.empty()) { 207 auto f = string_split(params.model_url, '#').front(); 208 f = string_split(f, '?').front(); 209 params.model = fs_get_cache_file(string_split(f, '/').back()); 210 } 211 } else if (params.model.empty()) { 212 params.model = DEFAULT_MODEL_PATH; 213 } 214 } 215 216 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { 217 bool invalid_param = false; 218 std::string arg; 219 const std::string arg_prefix = "--"; 220 llama_sampling_params & sparams = params.sparams; 221 222 for (int i = 1; i < argc; i++) { 223 arg = argv[i]; 224 if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { 225 std::replace(arg.begin(), arg.end(), '_', '-'); 226 } 227 if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { 228 throw std::invalid_argument("error: 
unknown argument: " + arg); 229 } 230 if (invalid_param) { 231 throw std::invalid_argument("error: invalid parameter for argument: " + arg); 232 } 233 } 234 235 if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { 236 throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); 237 } 238 239 gpt_params_handle_model_default(params); 240 241 if (params.escape) { 242 string_process_escapes(params.prompt); 243 string_process_escapes(params.input_prefix); 244 string_process_escapes(params.input_suffix); 245 string_process_escapes(sparams.cfg_negative_prompt); 246 for (auto & antiprompt : params.antiprompt) { 247 string_process_escapes(antiprompt); 248 } 249 } 250 251 if (!params.kv_overrides.empty()) { 252 params.kv_overrides.emplace_back(); 253 params.kv_overrides.back().key[0] = 0; 254 } 255 256 return true; 257 } 258 259 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { 260 const auto params_org = params; // the example can modify the default params 261 262 try { 263 if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { 264 params = params_org; 265 params.usage = true; 266 return false; 267 } 268 } catch (const std::invalid_argument & ex) { 269 fprintf(stderr, "%s\n", ex.what()); 270 params = params_org; 271 return false; 272 } 273 274 return true; 275 } 276 277 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { 278 const char split_delim = ','; 279 280 llama_sampling_params & sparams = params.sparams; 281 282 if (arg == "-s" || arg == "--seed") { 283 if (++i >= argc) { 284 invalid_param = true; 285 return true; 286 } 287 // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. 
288 params.seed = std::stoul(argv[i]); 289 sparams.seed = std::stoul(argv[i]); 290 return true; 291 } 292 if (arg == "-t" || arg == "--threads") { 293 if (++i >= argc) { 294 invalid_param = true; 295 return true; 296 } 297 params.n_threads = std::stoi(argv[i]); 298 if (params.n_threads <= 0) { 299 params.n_threads = std::thread::hardware_concurrency(); 300 } 301 return true; 302 } 303 if (arg == "-tb" || arg == "--threads-batch") { 304 if (++i >= argc) { 305 invalid_param = true; 306 return true; 307 } 308 params.n_threads_batch = std::stoi(argv[i]); 309 if (params.n_threads_batch <= 0) { 310 params.n_threads_batch = std::thread::hardware_concurrency(); 311 } 312 return true; 313 } 314 if (arg == "-td" || arg == "--threads-draft") { 315 if (++i >= argc) { 316 invalid_param = true; 317 return true; 318 } 319 params.n_threads_draft = std::stoi(argv[i]); 320 if (params.n_threads_draft <= 0) { 321 params.n_threads_draft = std::thread::hardware_concurrency(); 322 } 323 return true; 324 } 325 if (arg == "-tbd" || arg == "--threads-batch-draft") { 326 if (++i >= argc) { 327 invalid_param = true; 328 return true; 329 } 330 params.n_threads_batch_draft = std::stoi(argv[i]); 331 if (params.n_threads_batch_draft <= 0) { 332 params.n_threads_batch_draft = std::thread::hardware_concurrency(); 333 } 334 return true; 335 } 336 if (arg == "-p" || arg == "--prompt") { 337 if (++i >= argc) { 338 invalid_param = true; 339 return true; 340 } 341 params.prompt = argv[i]; 342 return true; 343 } 344 if (arg == "-e" || arg == "--escape") { 345 params.escape = true; 346 return true; 347 } 348 if (arg == "--no-escape") { 349 params.escape = false; 350 return true; 351 } 352 if (arg == "--prompt-cache") { 353 if (++i >= argc) { 354 invalid_param = true; 355 return true; 356 } 357 params.path_prompt_cache = argv[i]; 358 return true; 359 } 360 if (arg == "--prompt-cache-all") { 361 params.prompt_cache_all = true; 362 return true; 363 } 364 if (arg == "--prompt-cache-ro") { 365 
params.prompt_cache_ro = true; 366 return true; 367 } 368 if (arg == "-bf" || arg == "--binary-file") { 369 if (++i >= argc) { 370 invalid_param = true; 371 return true; 372 } 373 std::ifstream file(argv[i], std::ios::binary); 374 if (!file) { 375 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 376 invalid_param = true; 377 return true; 378 } 379 // store the external file name in params 380 params.prompt_file = argv[i]; 381 std::ostringstream ss; 382 ss << file.rdbuf(); 383 params.prompt = ss.str(); 384 fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); 385 return true; 386 } 387 if (arg == "-f" || arg == "--file") { 388 if (++i >= argc) { 389 invalid_param = true; 390 return true; 391 } 392 std::ifstream file(argv[i]); 393 if (!file) { 394 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 395 invalid_param = true; 396 return true; 397 } 398 // store the external file name in params 399 params.prompt_file = argv[i]; 400 std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); 401 if (!params.prompt.empty() && params.prompt.back() == '\n') { 402 params.prompt.pop_back(); 403 } 404 return true; 405 } 406 if (arg == "--in-file") { 407 if (++i >= argc) { 408 invalid_param = true; 409 return true; 410 } 411 std::ifstream file(argv[i]); 412 if (!file) { 413 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 414 invalid_param = true; 415 return true; 416 } 417 params.in_files.push_back(argv[i]); 418 return true; 419 } 420 if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { 421 if (++i >= argc) { 422 invalid_param = true; 423 return true; 424 } 425 params.n_predict = std::stoi(argv[i]); 426 return true; 427 } 428 if (arg == "--top-k") { 429 if (++i >= argc) { 430 invalid_param = true; 431 return true; 432 } 433 sparams.top_k = std::stoi(argv[i]); 434 return true; 435 } 436 if (arg == "-c" || arg == "--ctx-size") { 437 if (++i >= 
argc) { 438 invalid_param = true; 439 return true; 440 } 441 params.n_ctx = std::stoi(argv[i]); 442 return true; 443 } 444 if (arg == "--grp-attn-n" || arg == "-gan") { 445 if (++i >= argc) { 446 invalid_param = true; 447 return true; 448 } 449 params.grp_attn_n = std::stoi(argv[i]); 450 return true; 451 } 452 if (arg == "--grp-attn-w" || arg == "-gaw") { 453 if (++i >= argc) { 454 invalid_param = true; 455 return true; 456 } 457 params.grp_attn_w = std::stoi(argv[i]); 458 return true; 459 } 460 if (arg == "--rope-freq-base") { 461 if (++i >= argc) { 462 invalid_param = true; 463 return true; 464 } 465 params.rope_freq_base = std::stof(argv[i]); 466 return true; 467 } 468 if (arg == "--rope-freq-scale") { 469 if (++i >= argc) { 470 invalid_param = true; 471 return true; 472 } 473 params.rope_freq_scale = std::stof(argv[i]); 474 return true; 475 } 476 if (arg == "--rope-scaling") { 477 if (++i >= argc) { 478 invalid_param = true; 479 return true; 480 } 481 std::string value(argv[i]); 482 /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } 483 else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } 484 else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } 485 else { invalid_param = true; } 486 return true; 487 } 488 if (arg == "--rope-scale") { 489 if (++i >= argc) { 490 invalid_param = true; 491 return true; 492 } 493 params.rope_freq_scale = 1.0f / std::stof(argv[i]); 494 return true; 495 } 496 if (arg == "--yarn-orig-ctx") { 497 if (++i >= argc) { 498 invalid_param = true; 499 return true; 500 } 501 params.yarn_orig_ctx = std::stoi(argv[i]); 502 return true; 503 } 504 if (arg == "--yarn-ext-factor") { 505 if (++i >= argc) { 506 invalid_param = true; 507 return true; 508 } 509 params.yarn_ext_factor = std::stof(argv[i]); 510 return true; 511 } 512 if (arg == "--yarn-attn-factor") { 513 if (++i >= argc) { 514 invalid_param = true; 515 return true; 516 } 517 
params.yarn_attn_factor = std::stof(argv[i]); 518 return true; 519 } 520 if (arg == "--yarn-beta-fast") { 521 if (++i >= argc) { 522 invalid_param = true; 523 return true; 524 } 525 params.yarn_beta_fast = std::stof(argv[i]); 526 return true; 527 } 528 if (arg == "--yarn-beta-slow") { 529 if (++i >= argc) { 530 invalid_param = true; 531 return true; 532 } 533 params.yarn_beta_slow = std::stof(argv[i]); 534 return true; 535 } 536 if (arg == "--pooling") { 537 if (++i >= argc) { 538 invalid_param = true; 539 return true; 540 } 541 std::string value(argv[i]); 542 /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } 543 else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } 544 else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } 545 else { invalid_param = true; } 546 return true; 547 } 548 if (arg == "--defrag-thold" || arg == "-dt") { 549 if (++i >= argc) { 550 invalid_param = true; 551 return true; 552 } 553 params.defrag_thold = std::stof(argv[i]); 554 return true; 555 } 556 if (arg == "--samplers") { 557 if (++i >= argc) { 558 invalid_param = true; 559 return true; 560 } 561 const auto sampler_names = string_split(argv[i], ';'); 562 sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true); 563 return true; 564 } 565 if (arg == "--sampling-seq") { 566 if (++i >= argc) { 567 invalid_param = true; 568 return true; 569 } 570 sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]); 571 return true; 572 } 573 if (arg == "--top-p") { 574 if (++i >= argc) { 575 invalid_param = true; 576 return true; 577 } 578 sparams.top_p = std::stof(argv[i]); 579 return true; 580 } 581 if (arg == "--min-p") { 582 if (++i >= argc) { 583 invalid_param = true; 584 return true; 585 } 586 sparams.min_p = std::stof(argv[i]); 587 return true; 588 } 589 if (arg == "--temp") { 590 if (++i >= argc) { 591 invalid_param = true; 592 return true; 593 } 594 sparams.temp = std::stof(argv[i]); 595 
sparams.temp = std::max(sparams.temp, 0.0f); 596 return true; 597 } 598 if (arg == "--tfs") { 599 if (++i >= argc) { 600 invalid_param = true; 601 return true; 602 } 603 sparams.tfs_z = std::stof(argv[i]); 604 return true; 605 } 606 if (arg == "--typical") { 607 if (++i >= argc) { 608 invalid_param = true; 609 return true; 610 } 611 sparams.typical_p = std::stof(argv[i]); 612 return true; 613 } 614 if (arg == "--repeat-last-n") { 615 if (++i >= argc) { 616 invalid_param = true; 617 return true; 618 } 619 sparams.penalty_last_n = std::stoi(argv[i]); 620 sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); 621 return true; 622 } 623 if (arg == "--repeat-penalty") { 624 if (++i >= argc) { 625 invalid_param = true; 626 return true; 627 } 628 sparams.penalty_repeat = std::stof(argv[i]); 629 return true; 630 } 631 if (arg == "--frequency-penalty") { 632 if (++i >= argc) { 633 invalid_param = true; 634 return true; 635 } 636 sparams.penalty_freq = std::stof(argv[i]); 637 return true; 638 } 639 if (arg == "--presence-penalty") { 640 if (++i >= argc) { 641 invalid_param = true; 642 return true; 643 } 644 sparams.penalty_present = std::stof(argv[i]); 645 return true; 646 } 647 if (arg == "--dynatemp-range") { 648 if (++i >= argc) { 649 invalid_param = true; 650 return true; 651 } 652 sparams.dynatemp_range = std::stof(argv[i]); 653 return true; 654 } 655 if (arg == "--dynatemp-exp") { 656 if (++i >= argc) { 657 invalid_param = true; 658 return true; 659 } 660 sparams.dynatemp_exponent = std::stof(argv[i]); 661 return true; 662 } 663 if (arg == "--mirostat") { 664 if (++i >= argc) { 665 invalid_param = true; 666 return true; 667 } 668 sparams.mirostat = std::stoi(argv[i]); 669 return true; 670 } 671 if (arg == "--mirostat-lr") { 672 if (++i >= argc) { 673 invalid_param = true; 674 return true; 675 } 676 sparams.mirostat_eta = std::stof(argv[i]); 677 return true; 678 } 679 if (arg == "--mirostat-ent") { 680 if (++i >= argc) { 681 invalid_param = true; 682 return 
true; 683 } 684 sparams.mirostat_tau = std::stof(argv[i]); 685 return true; 686 } 687 if (arg == "--cfg-negative-prompt") { 688 if (++i >= argc) { 689 invalid_param = true; 690 return true; 691 } 692 sparams.cfg_negative_prompt = argv[i]; 693 return true; 694 } 695 if (arg == "--cfg-negative-prompt-file") { 696 if (++i >= argc) { 697 invalid_param = true; 698 return true; 699 } 700 std::ifstream file(argv[i]); 701 if (!file) { 702 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 703 invalid_param = true; 704 return true; 705 } 706 std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt)); 707 if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { 708 sparams.cfg_negative_prompt.pop_back(); 709 } 710 return true; 711 } 712 if (arg == "--cfg-scale") { 713 if (++i >= argc) { 714 invalid_param = true; 715 return true; 716 } 717 sparams.cfg_scale = std::stof(argv[i]); 718 return true; 719 } 720 if (arg == "-b" || arg == "--batch-size") { 721 if (++i >= argc) { 722 invalid_param = true; 723 return true; 724 } 725 params.n_batch = std::stoi(argv[i]); 726 return true; 727 } 728 if (arg == "-ub" || arg == "--ubatch-size") { 729 if (++i >= argc) { 730 invalid_param = true; 731 return true; 732 } 733 params.n_ubatch = std::stoi(argv[i]); 734 return true; 735 } 736 if (arg == "--keep") { 737 if (++i >= argc) { 738 invalid_param = true; 739 return true; 740 } 741 params.n_keep = std::stoi(argv[i]); 742 return true; 743 } 744 if (arg == "--draft") { 745 if (++i >= argc) { 746 invalid_param = true; 747 return true; 748 } 749 params.n_draft = std::stoi(argv[i]); 750 return true; 751 } 752 if (arg == "--chunks") { 753 if (++i >= argc) { 754 invalid_param = true; 755 return true; 756 } 757 params.n_chunks = std::stoi(argv[i]); 758 return true; 759 } 760 if (arg == "-np" || arg == "--parallel") { 761 if (++i >= argc) { 762 invalid_param = true; 763 return true; 764 } 
765 params.n_parallel = std::stoi(argv[i]); 766 return true; 767 } 768 if (arg == "-ns" || arg == "--sequences") { 769 if (++i >= argc) { 770 invalid_param = true; 771 return true; 772 } 773 params.n_sequences = std::stoi(argv[i]); 774 return true; 775 } 776 if (arg == "--p-split" || arg == "-ps") { 777 if (++i >= argc) { 778 invalid_param = true; 779 return true; 780 } 781 params.p_split = std::stof(argv[i]); 782 return true; 783 } 784 if (arg == "-m" || arg == "--model") { 785 if (++i >= argc) { 786 invalid_param = true; 787 return true; 788 } 789 params.model = argv[i]; 790 return true; 791 } 792 if (arg == "-md" || arg == "--model-draft") { 793 if (++i >= argc) { 794 invalid_param = true; 795 return true; 796 } 797 params.model_draft = argv[i]; 798 return true; 799 } 800 if (arg == "-a" || arg == "--alias") { 801 if (++i >= argc) { 802 invalid_param = true; 803 return true; 804 } 805 params.model_alias = argv[i]; 806 return true; 807 } 808 if (arg == "-mu" || arg == "--model-url") { 809 if (++i >= argc) { 810 invalid_param = true; 811 return true; 812 } 813 params.model_url = argv[i]; 814 return true; 815 } 816 if (arg == "-hfr" || arg == "--hf-repo") { 817 if (++i >= argc) { 818 invalid_param = true; 819 return true; 820 } 821 params.hf_repo = argv[i]; 822 return true; 823 } 824 if (arg == "-hff" || arg == "--hf-file") { 825 if (++i >= argc) { 826 invalid_param = true; 827 return true; 828 } 829 params.hf_file = argv[i]; 830 return true; 831 } 832 if (arg == "--lora") { 833 if (++i >= argc) { 834 invalid_param = true; 835 return true; 836 } 837 params.lora_adapter.emplace_back(argv[i], 1.0f); 838 params.use_mmap = false; 839 return true; 840 } 841 if (arg == "--lora-scaled") { 842 if (++i >= argc) { 843 invalid_param = true; 844 return true; 845 } 846 const char* lora_adapter = argv[i]; 847 if (++i >= argc) { 848 invalid_param = true; 849 return true; 850 } 851 params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); 852 params.use_mmap = false; 853 
return true; 854 } 855 if (arg == "--lora-base") { 856 if (++i >= argc) { 857 invalid_param = true; 858 return true; 859 } 860 params.lora_base = argv[i]; 861 return true; 862 } 863 if (arg == "--control-vector") { 864 if (++i >= argc) { 865 invalid_param = true; 866 return true; 867 } 868 params.control_vectors.push_back({ 1.0f, argv[i], }); 869 return true; 870 } 871 if (arg == "--control-vector-scaled") { 872 if (++i >= argc) { 873 invalid_param = true; 874 return true; 875 } 876 const char* fname = argv[i]; 877 if (++i >= argc) { 878 invalid_param = true; 879 return true; 880 } 881 params.control_vectors.push_back({ std::stof(argv[i]), fname, }); 882 return true; 883 } 884 if (arg == "--control-vector-layer-range") { 885 if (++i >= argc) { 886 invalid_param = true; 887 return true; 888 } 889 params.control_vector_layer_start = std::stoi(argv[i]); 890 if (++i >= argc) { 891 invalid_param = true; 892 return true; 893 } 894 params.control_vector_layer_end = std::stoi(argv[i]); 895 return true; 896 } 897 if (arg == "--mmproj") { 898 if (++i >= argc) { 899 invalid_param = true; 900 return true; 901 } 902 params.mmproj = argv[i]; 903 return true; 904 } 905 if (arg == "--image") { 906 if (++i >= argc) { 907 invalid_param = true; 908 return true; 909 } 910 params.image.emplace_back(argv[i]); 911 return true; 912 } 913 if (arg == "-i" || arg == "--interactive") { 914 params.interactive = true; 915 return true; 916 } 917 if (arg == "-sp" || arg == "--special") { 918 params.special = true; 919 return true; 920 } 921 if (arg == "--embedding" || arg == "--embeddings") { 922 params.embedding = true; 923 return true; 924 } 925 if (arg == "-if" || arg == "--interactive-first") { 926 params.interactive_first = true; 927 return true; 928 } 929 if (arg == "-cnv" || arg == "--conversation") { 930 params.conversation = true; 931 return true; 932 } 933 if (arg == "--infill") { 934 params.infill = true; 935 return true; 936 } 937 if (arg == "-dkvc" || arg == "--dump-kv-cache") { 938 
params.dump_kv_cache = true; 939 return true; 940 } 941 if (arg == "-nkvo" || arg == "--no-kv-offload") { 942 params.no_kv_offload = true; 943 return true; 944 } 945 if (arg == "-ctk" || arg == "--cache-type-k") { 946 params.cache_type_k = argv[++i]; 947 return true; 948 } 949 if (arg == "-ctv" || arg == "--cache-type-v") { 950 params.cache_type_v = argv[++i]; 951 return true; 952 } 953 if (arg == "--multiline-input") { 954 params.multiline_input = true; 955 return true; 956 } 957 if (arg == "--simple-io") { 958 params.simple_io = true; 959 return true; 960 } 961 if (arg == "-cb" || arg == "--cont-batching") { 962 params.cont_batching = true; 963 return true; 964 } 965 if (arg == "-fa" || arg == "--flash-attn") { 966 params.flash_attn = true; 967 return true; 968 } 969 if (arg == "-co" || arg == "--color") { 970 params.use_color = true; 971 return true; 972 } 973 if (arg == "--mlock") { 974 params.use_mlock = true; 975 return true; 976 } 977 if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { 978 if (++i >= argc) { 979 invalid_param = true; 980 return true; 981 } 982 params.n_gpu_layers = std::stoi(argv[i]); 983 if (!llama_supports_gpu_offload()) { 984 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); 985 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); 986 } 987 return true; 988 } 989 if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { 990 if (++i >= argc) { 991 invalid_param = true; 992 return true; 993 } 994 params.n_gpu_layers_draft = std::stoi(argv[i]); 995 if (!llama_supports_gpu_offload()) { 996 fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); 997 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); 998 } 999 return true; 1000 } 1001 if (arg == "--main-gpu" || arg == "-mg") { 1002 if (++i >= argc) { 
1003 invalid_param = true; 1004 return true; 1005 } 1006 params.main_gpu = std::stoi(argv[i]); 1007 #ifndef GGML_USE_CUDA_SYCL_VULKAN 1008 fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); 1009 #endif // GGML_USE_CUDA_SYCL_VULKAN 1010 return true; 1011 } 1012 if (arg == "--split-mode" || arg == "-sm") { 1013 if (++i >= argc) { 1014 invalid_param = true; 1015 return true; 1016 } 1017 std::string arg_next = argv[i]; 1018 if (arg_next == "none") { 1019 params.split_mode = LLAMA_SPLIT_MODE_NONE; 1020 } 1021 else if (arg_next == "layer") { 1022 params.split_mode = LLAMA_SPLIT_MODE_LAYER; 1023 } 1024 else if (arg_next == "row") { 1025 #ifdef GGML_USE_SYCL 1026 fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); 1027 exit(1); 1028 #endif // GGML_USE_SYCL 1029 params.split_mode = LLAMA_SPLIT_MODE_ROW; 1030 } 1031 else { 1032 invalid_param = true; 1033 return true; 1034 } 1035 #ifndef GGML_USE_CUDA_SYCL_VULKAN 1036 fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the split mode has no effect.\n"); 1037 #endif // GGML_USE_CUDA_SYCL_VULKAN 1038 return true; 1039 } 1040 if (arg == "--tensor-split" || arg == "-ts") { 1041 if (++i >= argc) { 1042 invalid_param = true; 1043 return true; 1044 } 1045 std::string arg_next = argv[i]; 1046 1047 // split string by , and / 1048 const std::regex regex{ R"([,/]+)" }; 1049 std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; 1050 std::vector<std::string> split_arg{ it, {} }; 1051 if (split_arg.size() >= llama_max_devices()) { 1052 invalid_param = true; 1053 return true; 1054 } 1055 for (size_t i = 0; i < llama_max_devices(); ++i) { 1056 if (i < split_arg.size()) { 1057 params.tensor_split[i] = std::stof(split_arg[i]); 1058 } 1059 else { 1060 params.tensor_split[i] = 0.0f; 1061 } 1062 } 1063 #ifndef GGML_USE_CUDA_SYCL_VULKAN 1064 fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); 1065 #endif // GGML_USE_CUDA_SYCL_VULKAN 1066 return true; 1067 } 1068 if (arg == "--rpc") { 1069 if (++i >= argc) { 1070 invalid_param = true; 1071 return true; 1072 } 1073 params.rpc_servers = argv[i]; 1074 return true; 1075 } 1076 if (arg == "--no-mmap") { 1077 params.use_mmap = false; 1078 return true; 1079 } 1080 if (arg == "--numa") { 1081 if (++i >= argc) { 1082 invalid_param = true; 1083 return true; 1084 } 1085 std::string value(argv[i]); 1086 /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } 1087 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } 1088 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } 1089 else { invalid_param = true; } 1090 return true; 1091 } 1092 if (arg == "-v" || arg == "--verbose") { 1093 params.verbosity = 1; 1094 return true; 1095 } 1096 if (arg == "--verbosity") { 1097 if (++i >= argc) { 1098 invalid_param = true; 1099 return true; 1100 } 1101 params.verbosity = std::stoi(argv[i]); 1102 
return true; 1103 } 1104 if (arg == "--verbose-prompt") { 1105 params.verbose_prompt = true; 1106 return true; 1107 } 1108 if (arg == "--no-display-prompt") { 1109 params.display_prompt = false; 1110 return true; 1111 } 1112 if (arg == "-r" || arg == "--reverse-prompt") { 1113 if (++i >= argc) { 1114 invalid_param = true; 1115 return true; 1116 } 1117 params.antiprompt.emplace_back(argv[i]); 1118 return true; 1119 } 1120 if (arg == "-ld" || arg == "--logdir") { 1121 if (++i >= argc) { 1122 invalid_param = true; 1123 return true; 1124 } 1125 params.logdir = argv[i]; 1126 1127 if (params.logdir.back() != DIRECTORY_SEPARATOR) { 1128 params.logdir += DIRECTORY_SEPARATOR; 1129 } 1130 return true; 1131 } 1132 if (arg == "-lcs" || arg == "--lookup-cache-static") { 1133 if (++i >= argc) { 1134 invalid_param = true; 1135 return true; 1136 } 1137 params.lookup_cache_static = argv[i]; 1138 return true; 1139 } 1140 if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { 1141 if (++i >= argc) { 1142 invalid_param = true; 1143 return true; 1144 } 1145 params.lookup_cache_dynamic = argv[i]; 1146 return true; 1147 } 1148 if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { 1149 if (++i >= argc) { 1150 invalid_param = true; 1151 return true; 1152 } 1153 params.logits_file = argv[i]; 1154 return true; 1155 } 1156 if (arg == "--perplexity" || arg == "--all-logits") { 1157 params.logits_all = true; 1158 return true; 1159 } 1160 if (arg == "--ppl-stride") { 1161 if (++i >= argc) { 1162 invalid_param = true; 1163 return true; 1164 } 1165 params.ppl_stride = std::stoi(argv[i]); 1166 return true; 1167 } 1168 if (arg == "--ppl-output-type") { 1169 if (++i >= argc) { 1170 invalid_param = true; 1171 return true; 1172 } 1173 params.ppl_output_type = std::stoi(argv[i]); 1174 return true; 1175 } 1176 if (arg == "-ptc" || arg == "--print-token-count") { 1177 if (++i >= argc) { 1178 invalid_param = true; 1179 return true; 1180 } 1181 params.n_print = std::stoi(argv[i]); 1182 return 
true; 1183 } 1184 if (arg == "--check-tensors") { 1185 params.check_tensors = true; 1186 return true; 1187 } 1188 if (arg == "--hellaswag") { 1189 params.hellaswag = true; 1190 return true; 1191 } 1192 if (arg == "--hellaswag-tasks") { 1193 if (++i >= argc) { 1194 invalid_param = true; 1195 return true; 1196 } 1197 params.hellaswag_tasks = std::stoi(argv[i]); 1198 return true; 1199 } 1200 if (arg == "--winogrande") { 1201 params.winogrande = true; 1202 return true; 1203 } 1204 if (arg == "--winogrande-tasks") { 1205 if (++i >= argc) { 1206 invalid_param = true; 1207 return true; 1208 } 1209 params.winogrande_tasks = std::stoi(argv[i]); 1210 return true; 1211 } 1212 if (arg == "--multiple-choice") { 1213 params.multiple_choice = true; 1214 return true; 1215 } 1216 if (arg == "--multiple-choice-tasks") { 1217 if (++i >= argc) { 1218 invalid_param = true; 1219 return true; 1220 } 1221 params.multiple_choice_tasks = std::stoi(argv[i]); 1222 return true; 1223 } 1224 if (arg == "--kl-divergence") { 1225 params.kl_divergence = true; 1226 return true; 1227 } 1228 if (arg == "--ignore-eos") { 1229 params.ignore_eos = true; 1230 return true; 1231 } 1232 if (arg == "--penalize-nl") { 1233 sparams.penalize_nl = true; 1234 return true; 1235 } 1236 if (arg == "-l" || arg == "--logit-bias") { 1237 if (++i >= argc) { 1238 invalid_param = true; 1239 return true; 1240 } 1241 std::stringstream ss(argv[i]); 1242 llama_token key; 1243 char sign; 1244 std::string value_str; 1245 try { 1246 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { 1247 sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? 
-1.0f : 1.0f); 1248 } 1249 else { 1250 throw std::exception(); 1251 } 1252 } 1253 catch (const std::exception&) { 1254 invalid_param = true; 1255 return true; 1256 } 1257 return true; 1258 } 1259 if (arg == "-h" || arg == "--help" || arg == "--usage" ) { 1260 params.usage = true; 1261 return true; 1262 } 1263 if (arg == "--version") { 1264 fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); 1265 fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); 1266 exit(0); 1267 } 1268 if (arg == "--in-prefix-bos") { 1269 params.input_prefix_bos = true; 1270 return true; 1271 } 1272 if (arg == "--in-prefix") { 1273 if (++i >= argc) { 1274 invalid_param = true; 1275 return true; 1276 } 1277 params.input_prefix = argv[i]; 1278 return true; 1279 } 1280 if (arg == "--in-suffix") { 1281 if (++i >= argc) { 1282 invalid_param = true; 1283 return true; 1284 } 1285 params.input_suffix = argv[i]; 1286 return true; 1287 } 1288 if (arg == "--grammar") { 1289 if (++i >= argc) { 1290 invalid_param = true; 1291 return true; 1292 } 1293 sparams.grammar = argv[i]; 1294 return true; 1295 } 1296 if (arg == "--grammar-file") { 1297 if (++i >= argc) { 1298 invalid_param = true; 1299 return true; 1300 } 1301 std::ifstream file(argv[i]); 1302 if (!file) { 1303 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 1304 invalid_param = true; 1305 return true; 1306 } 1307 std::copy( 1308 std::istreambuf_iterator<char>(file), 1309 std::istreambuf_iterator<char>(), 1310 std::back_inserter(sparams.grammar) 1311 ); 1312 return true; 1313 } 1314 if (arg == "-j" || arg == "--json-schema") { 1315 if (++i >= argc) { 1316 invalid_param = true; 1317 return true; 1318 } 1319 sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); 1320 return true; 1321 } 1322 if (arg == "--override-kv") { 1323 if (++i >= argc) { 1324 invalid_param = true; 1325 return true; 1326 } 1327 if (!string_parse_kv_override(argv[i], params.kv_overrides)) { 1328 fprintf(stderr, 
"error: Invalid type for KV override: %s\n", argv[i]); 1329 invalid_param = true; 1330 return true; 1331 } 1332 return true; 1333 } 1334 if (arg == "--host") { 1335 if (++i >= argc) { 1336 invalid_param = true; 1337 return true; 1338 } 1339 params.hostname = argv[i]; 1340 return true; 1341 } 1342 if (arg == "--port") { 1343 if (++i >= argc) { 1344 invalid_param = true; 1345 return true; 1346 } 1347 params.port = std::stoi(argv[i]); 1348 return true; 1349 } 1350 if (arg == "--path") { 1351 if (++i >= argc) { 1352 invalid_param = true; 1353 return true; 1354 } 1355 params.public_path = argv[i]; 1356 return true; 1357 } 1358 if (arg == "--api-key") { 1359 if (++i >= argc) { 1360 invalid_param = true; 1361 return true; 1362 } 1363 params.api_keys.push_back(argv[i]); 1364 return true; 1365 } 1366 if (arg == "--api-key-file") { 1367 if (++i >= argc) { 1368 invalid_param = true; 1369 return true; 1370 } 1371 std::ifstream key_file(argv[i]); 1372 if (!key_file) { 1373 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 1374 invalid_param = true; 1375 return true; 1376 } 1377 std::string key; 1378 while (std::getline(key_file, key)) { 1379 if (!key.empty()) { 1380 params.api_keys.push_back(key); 1381 } 1382 } 1383 key_file.close(); 1384 return true; 1385 } 1386 if (arg == "--ssl-key-file") { 1387 if (++i >= argc) { 1388 invalid_param = true; 1389 return true; 1390 } 1391 params.ssl_file_key = argv[i]; 1392 return true; 1393 } 1394 if (arg == "--ssl-cert-file") { 1395 if (++i >= argc) { 1396 invalid_param = true; 1397 return true; 1398 } 1399 params.ssl_file_cert = argv[i]; 1400 return true; 1401 } 1402 if (arg == "--timeout" || arg == "-to") { 1403 if (++i >= argc) { 1404 invalid_param = true; 1405 return true; 1406 } 1407 params.timeout_read = std::stoi(argv[i]); 1408 params.timeout_write = std::stoi(argv[i]); 1409 return true; 1410 } 1411 if (arg == "--threads-http") { 1412 if (++i >= argc) { 1413 invalid_param = true; 1414 return true; 1415 } 1416 
params.n_threads_http = std::stoi(argv[i]); 1417 return true; 1418 } 1419 if (arg == "-spf" || arg == "--system-prompt-file") { 1420 if (++i >= argc) { 1421 invalid_param = true; 1422 return true; 1423 } 1424 std::ifstream file(argv[i]); 1425 if (!file) { 1426 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 1427 invalid_param = true; 1428 return true; 1429 } 1430 std::string system_prompt; 1431 std::copy( 1432 std::istreambuf_iterator<char>(file), 1433 std::istreambuf_iterator<char>(), 1434 std::back_inserter(system_prompt) 1435 ); 1436 params.system_prompt = system_prompt; 1437 return true; 1438 } 1439 if (arg == "--log-format") { 1440 if (++i >= argc) { 1441 invalid_param = true; 1442 return true; 1443 } 1444 if (std::strcmp(argv[i], "json") == 0) { 1445 params.log_json = true; 1446 } else if (std::strcmp(argv[i], "text") == 0) { 1447 params.log_json = false; 1448 } else { 1449 invalid_param = true; 1450 return true; 1451 } 1452 return true; 1453 } 1454 if (arg == "--no-slots") { 1455 params.endpoint_slots = false; 1456 return true; 1457 } 1458 if (arg == "--metrics") { 1459 params.endpoint_metrics = true; 1460 return true; 1461 } 1462 if (arg == "--slot-save-path") { 1463 if (++i >= argc) { 1464 invalid_param = true; 1465 return true; 1466 } 1467 params.slot_save_path = argv[i]; 1468 // if doesn't end with DIRECTORY_SEPARATOR, add it 1469 if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { 1470 params.slot_save_path += DIRECTORY_SEPARATOR; 1471 } 1472 return true; 1473 } 1474 if (arg == "--chat-template") { 1475 if (++i >= argc) { 1476 invalid_param = true; 1477 return true; 1478 } 1479 if (!llama_chat_verify_template(argv[i])) { 1480 fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); 1481 fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); 1482 invalid_param = true; 1483 return true; 1484 } 
1485 params.chat_template = argv[i]; 1486 return true; 1487 } 1488 if (arg == "--slot-prompt-similarity" || arg == "-sps") { 1489 if (++i >= argc) { 1490 invalid_param = true; 1491 return true; 1492 } 1493 params.slot_prompt_similarity = std::stof(argv[i]); 1494 return true; 1495 } 1496 if (arg == "-pps") { 1497 params.is_pp_shared = true; 1498 return true; 1499 } 1500 if (arg == "-npp") { 1501 if (++i >= argc) { 1502 invalid_param = true; 1503 return true; 1504 } 1505 auto p = string_split<int>(argv[i], split_delim); 1506 params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); 1507 return true; 1508 } 1509 if (arg == "-ntg") { 1510 if (++i >= argc) { 1511 invalid_param = true; 1512 return true; 1513 } 1514 auto p = string_split<int>(argv[i], split_delim); 1515 params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); 1516 return true; 1517 } 1518 if (arg == "-npl") { 1519 if (++i >= argc) { 1520 invalid_param = true; 1521 return true; 1522 } 1523 auto p = string_split<int>(argv[i], split_delim); 1524 params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); 1525 return true; 1526 } 1527 if (arg == "--context-file") { 1528 if (++i >= argc) { 1529 invalid_param = true; 1530 return true; 1531 } 1532 std::ifstream file(argv[i], std::ios::binary); 1533 if (!file) { 1534 fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); 1535 invalid_param = true; 1536 return true; 1537 } 1538 params.context_files.push_back(argv[i]); 1539 return true; 1540 } 1541 if (arg == "--chunk-size") { 1542 if (++i >= argc) { 1543 invalid_param = true; 1544 return true; 1545 } 1546 params.chunk_size = std::stoi(argv[i]); 1547 return true; 1548 } 1549 if (arg == "--chunk-separator") { 1550 if (++i >= argc) { 1551 invalid_param = true; 1552 return true; 1553 } 1554 params.chunk_separator = argv[i]; 1555 return true; 1556 } 1557 if (arg == "--junk") { 1558 if (++i >= argc) { 1559 invalid_param = true; 1560 return true; 1561 } 1562 params.n_junk = std::stoi(argv[i]); 1563 return true; 
1564 } 1565 if (arg == "--pos") { 1566 if (++i >= argc) { 1567 invalid_param = true; 1568 return true; 1569 } 1570 params.i_pos = std::stoi(argv[i]); 1571 return true; 1572 } 1573 if (arg == "-o" || arg == "--output" || arg == "--output-file") { 1574 if (++i >= argc) { 1575 invalid_param = true; 1576 return true; 1577 } 1578 params.out_file = argv[i]; 1579 params.cvector_outfile = argv[i]; 1580 return true; 1581 } 1582 if (arg == "-ofreq" || arg == "--output-frequency") { 1583 if (++i >= argc) { 1584 invalid_param = true; 1585 return true; 1586 } 1587 params.n_out_freq = std::stoi(argv[i]); 1588 return true; 1589 } 1590 if (arg == "--save-frequency") { 1591 if (++i >= argc) { 1592 invalid_param = true; 1593 return true; 1594 } 1595 params.n_save_freq = std::stoi(argv[i]); 1596 return true; 1597 } 1598 if (arg == "--process-output") { 1599 params.process_output = true; 1600 return true; 1601 } 1602 if (arg == "--no-ppl") { 1603 params.compute_ppl = false; 1604 return true; 1605 } 1606 if (arg == "--chunk" || arg == "--from-chunk") { 1607 if (++i >= argc) { 1608 invalid_param = true; 1609 return true; 1610 } 1611 params.i_chunk = std::stoi(argv[i]); 1612 return true; 1613 } 1614 // cvector params 1615 if (arg == "--completions-file") { 1616 if (++i >= argc) { 1617 invalid_param = true; 1618 return true; 1619 } 1620 params.cvector_completions_file = argv[i]; 1621 return true; 1622 } 1623 if (arg == "--positive-file") { 1624 if (++i >= argc) { 1625 invalid_param = true; 1626 return true; 1627 } 1628 params.cvector_positive_file = argv[i]; 1629 return true; 1630 } 1631 if (arg == "--negative-file") { 1632 if (++i >= argc) { 1633 invalid_param = true; 1634 return true; 1635 } 1636 params.cvector_negative_file = argv[i]; 1637 return true; 1638 } 1639 if (arg == "--completions") { 1640 if (++i >= argc) { 1641 invalid_param = true; 1642 return true; 1643 } 1644 params.n_completions = std::stoi(argv[i]); 1645 return true; 1646 } 1647 if (arg == "--pca-batch") { 1648 if (++i 
>= argc) { 1649 invalid_param = true; 1650 return true; 1651 } 1652 params.n_pca_batch = std::stoi(argv[i]); 1653 return true; 1654 } 1655 if (arg == "--pca-iter") { 1656 if (++i >= argc) { 1657 invalid_param = true; 1658 return true; 1659 } 1660 params.n_pca_iterations = std::stoi(argv[i]); 1661 return true; 1662 } 1663 #ifndef LOG_DISABLE_LOGS 1664 // Parse args for logging parameters 1665 if (log_param_single_parse(argv[i])) { 1666 // Do nothing, log_param_single_parse automatically does it's thing 1667 // and returns if a match was found and parsed. 1668 return true; 1669 } 1670 if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { 1671 // We have a matching known parameter requiring an argument, 1672 // now we need to check if there is anything after this argv 1673 // and flag invalid_param or parse it. 1674 if (++i >= argc) { 1675 invalid_param = true; 1676 return true; 1677 } 1678 if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { 1679 invalid_param = true; 1680 return true; 1681 } 1682 return true; 1683 } 1684 // End of Parse args for logging parameters 1685 #endif // LOG_DISABLE_LOGS 1686 1687 return false; 1688 } 1689 1690 #ifdef __GNUC__ 1691 #ifdef __MINGW32__ 1692 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) 1693 #else 1694 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) 1695 #endif 1696 #else 1697 #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) 
1698 #endif 1699 1700 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { 1701 const llama_sampling_params & sparams = params.sparams; 1702 1703 std::string sampler_type_chars; 1704 std::string sampler_type_names; 1705 for (const auto sampler_type : sparams.samplers_sequence) { 1706 sampler_type_chars += static_cast<char>(sampler_type); 1707 sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";"; 1708 } 1709 sampler_type_names.pop_back(); 1710 1711 struct option_info { 1712 LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) 1713 option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { 1714 va_list args_list; 1715 va_start(args_list, desc); 1716 char buffer[1024]; 1717 vsnprintf(buffer, sizeof(buffer), desc, args_list); 1718 va_end(args_list); 1719 this->desc = buffer; 1720 } 1721 1722 option_info(const std::string & grp) : grp(grp) {} 1723 1724 std::string tags; 1725 std::string args; 1726 std::string desc; 1727 std::string grp; 1728 }; 1729 1730 std::vector<option_info> options; 1731 1732 // TODO: filter by tags 1733 1734 options.push_back({ "general" }); 1735 options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); 1736 options.push_back({ "*", " --version", "show version and build info" }); 1737 options.push_back({ "*", "-v, --verbose", "print verbose information" }); 1738 options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); 1739 options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); 1740 options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); 1741 options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? 
"true" : "false" }); 1742 options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); 1743 options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); 1744 options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); 1745 options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); 1746 options.push_back({ "speculative", "-tbd, --threads-batch-draft N", 1747 "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); 1748 options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); 1749 options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); 1750 options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", 1751 "path to static lookup cache to use for lookup decoding (not updated by generation)" }); 1752 options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", 1753 "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); 1754 1755 options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); 1756 options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); 1757 options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); 1758 options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); 1759 options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", 
params.n_keep }); 1760 options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); 1761 options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); 1762 options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); 1763 options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); 1764 options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); 1765 options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); 1766 options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); 1767 options.push_back({ "*", " --no-escape", "do not process escape sequences" }); 1768 options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); 1769 options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); 1770 options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" 1771 "not supported with --interactive or other interactive options" }); 1772 options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); 1773 options.push_back({ "main", "-r, --reverse-prompt PROMPT", 1774 "halt generation at PROMPT, return control in interactive mode\n" 1775 "can be specified more than once for multiple prompts" }); 1776 options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? 
"true" : "false" }); 1777 options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); 1778 options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); 1779 options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); 1780 options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); 1781 options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); 1782 options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); 1783 options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); 1784 1785 options.push_back({ "sampling" }); 1786 options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" 1787 "(default: %s)", sampler_type_names.c_str() }); 1788 options.push_back({ "*", " --sampling-seq SEQUENCE", 1789 "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); 1790 options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); 1791 options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false" }); 1792 options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); 1793 options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); 1794 options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); 1795 options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); 1796 options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); 1797 options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); 1798 options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); 1799 options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); 1800 options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); 1801 options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); 1802 options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); 1803 options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); 1804 options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" 1805 "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" 1806 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); 1807 options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, 
parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); 1808 options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); 1809 options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" 1810 "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" 1811 "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); 1812 options.push_back({ "main", " --cfg-negative-prompt PROMPT", 1813 "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); 1814 options.push_back({ "main", " --cfg-negative-prompt-file FNAME", 1815 "negative prompt file to use for guidance" }); 1816 options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); 1817 1818 options.push_back({ "grammar" }); 1819 options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); 1820 options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); 1821 options.push_back({ "*", "-j, --json-schema SCHEMA", 1822 "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\n" 1823 "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); 1824 1825 options.push_back({ "embedding" }); 1826 options.push_back({ "embedding", " --pooling {none,mean,cls}", 1827 "pooling type for embeddings, use model default if unspecified" }); 1828 1829 options.push_back({ "context hacking" }); 1830 options.push_back({ "*", " --rope-scaling {none,linear,yarn}", 1831 "RoPE frequency scaling method, defaults to linear unless specified by the model" }); 1832 options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); 1833 options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); 1834 options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); 1835 options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); 1836 options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); 1837 options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); 1838 options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); 1839 options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); 1840 options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); 1841 options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); 1842 options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); 
1843 options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); 1844 options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); 1845 options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); 1846 1847 options.push_back({ "perplexity" }); 1848 options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); 1849 options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); 1850 options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); 1851 options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); 1852 options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); 1853 options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); 1854 options.push_back({ "perplexity", " --multiple-choice-tasks N", 1855 "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); 1856 options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); 1857 options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); 1858 options.push_back({ "perplexity", " --ppl-output-type {0,1}", 1859 "output type for perplexity calculation (default: %d)", params.ppl_output_type }); 1860 1861 options.push_back({ "parallel" }); 1862 options.push_back({ "*", "-dt, 
--defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); 1863 options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); 1864 options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); 1865 options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); 1866 1867 options.push_back({ "multi-modality" }); 1868 options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); 1869 options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" }); 1870 1871 options.push_back({ "backend" }); 1872 options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); 1873 if (llama_supports_mlock()) { 1874 options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); 1875 } 1876 if (llama_supports_mmap()) { 1877 options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); 1878 } 1879 options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" 1880 " - distribute: spread execution evenly over all nodes\n" 1881 " - isolate: only spawn threads on CPUs on the node that execution started on\n" 1882 " - numactl: use the CPU map provided by numactl\n" 1883 "if run without this previously, it is recommended to drop the system page cache before using this\n" 1884 "see https://github.com/ggerganov/llama.cpp/issues/1437" }); 1885 1886 if (llama_supports_gpu_offload()) { 1887 options.push_back({ "*", "-ngl, --gpu-layers N", 1888 "number of layers to store in VRAM" }); 1889 options.push_back({ "*", "-ngld, 
--gpu-layers-draft N", 1890 "number of layers to store in VRAM for the draft model" }); 1891 options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", 1892 "how to split the model across multiple GPUs, one of:\n" 1893 " - none: use one GPU only\n" 1894 " - layer (default): split layers and KV across GPUs\n" 1895 " - row: split rows across GPUs" }); 1896 options.push_back({ "*", "-ts, --tensor-split SPLIT", 1897 "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); 1898 options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" 1899 "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); 1900 } 1901 1902 options.push_back({ "model" }); 1903 options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); 1904 options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", 1905 "advanced option to override model metadata by key. may be specified multiple times.\n" 1906 "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); 1907 options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); 1908 options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); 1909 options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); 1910 options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); 1911 options.push_back({ "*", " --control-vector-scaled FNAME SCALE", 1912 "add a control vector with user defined scaling SCALE" }); 1913 options.push_back({ "*", " --control-vector-layer-range START END", 1914 "layer range to apply the control vector(s) to, start and end inclusive" }); 1915 options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" 1916 "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); 1917 options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); 1918 options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); 1919 options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); 1920 options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); 1921 1922 options.push_back({ "retrieval" }); 1923 options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); 1924 options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); 1925 options.push_back({ "retrieval", " --chunk-separator STRING", 1926 "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); 1927 1928 options.push_back({ "passkey" }); 1929 options.push_back({ "passkey", " --junk N", "number of times to repeat 
the junk text (default: %d)", params.n_junk }); 1930 options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); 1931 1932 options.push_back({ "imatrix" }); 1933 options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); 1934 options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); 1935 options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); 1936 options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); 1937 options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); 1938 options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); 1939 1940 options.push_back({ "bench" }); 1941 options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); 1942 options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); 1943 options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); 1944 options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); 1945 1946 options.push_back({ "server" }); 1947 options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); 1948 options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); 1949 options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); 1950 options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? 
"enabled" : "disabled" }); 1951 options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); 1952 options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); 1953 options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); 1954 options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); 1955 options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); 1956 options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); 1957 options.push_back({ "server", " --system-prompt-file FNAME", 1958 "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); 1959 options.push_back({ "server", " --log-format {text,json}", 1960 "log output format: json or text (default: json)" }); 1961 options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); 1962 options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled" }); 1963 options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); 1964 options.push_back({ "server", " --chat-template JINJA_TEMPLATE", 1965 "set custom jinja chat template (default: template taken from model's metadata)\n" 1966 "only commonly used templates are accepted:\n" 1967 "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); 1968 options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", 1969 "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); 1970 1971 #ifndef LOG_DISABLE_LOGS 1972 options.push_back({ "logging" }); 1973 options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); 1974 options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); 1975 options.push_back({ "logging", " --log-test", "Run simple logging test" }); 1976 options.push_back({ "logging", " --log-disable", "Disable trace logs" }); 1977 options.push_back({ "logging", " --log-enable", "Enable trace logs" }); 1978 options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); 1979 options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " 1980 "Each log file will have unique name: \"<name>.<ID>.log\"" }); 1981 options.push_back({ "logging", " --log-append", "Don't truncate the old log file." 
}); 1982 #endif // LOG_DISABLE_LOGS 1983 1984 options.push_back({ "cvector" }); 1985 options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); 1986 options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); 1987 options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); 1988 options.push_back({ "cvector", " --completions-file FNAME", 1989 "completions file (default: '%s')", params.cvector_completions_file.c_str() }); 1990 options.push_back({ "cvector", " --completions N", "number of lines of completions file to use (default: %d)", params.n_completions }); 1991 options.push_back({ "cvector", " --batch-pca N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); 1992 options.push_back({ "cvector", " --iter-pca N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); 1993 1994 printf("usage: %s [options]\n", argv[0]); 1995 1996 for (const auto & o : options) { 1997 if (!o.grp.empty()) { 1998 printf("\n%s:\n\n", o.grp.c_str()); 1999 continue; 2000 } 2001 printf(" %-32s", o.args.c_str()); 2002 if (o.args.length() > 30) { 2003 printf("\n%34s", ""); 2004 } 2005 2006 const auto desc = o.desc; 2007 size_t start = 0; 2008 size_t end = desc.find('\n'); 2009 while (end != std::string::npos) { 2010 printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); 2011 start = end + 1; 2012 end = desc.find('\n', start); 2013 } 2014 2015 printf("%s\n", desc.substr(start).c_str()); 2016 } 2017 printf("\n"); 2018 } 2019 2020 std::string gpt_params_get_system_info(const gpt_params & params) { 2021 std::ostringstream os; 2022 2023 os << "system_info: n_threads = " << params.n_threads; 2024 if (params.n_threads_batch != -1) { 2025 os << " 
// Split `input` at every occurrence of `separator`.
//
// Empty fields are preserved (e.g. "a,,b" yields "a", "", "b") and the result
// always holds at least one element: an input without any separator - including
// the empty string - is returned as a single part.
std::vector<std::string> string_split(std::string input, char separator) {
    std::vector<std::string> parts;

    // Walk the string with a moving start offset instead of repeatedly
    // re-assigning the remaining tail, which copied the rest of the input
    // on every separator (quadratic in the input length).
    size_t part_begin    = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        parts.emplace_back(input, part_begin, separator_pos - part_begin);
        part_begin    = separator_pos + 1;
        separator_pos = input.find(separator, part_begin);
    }

    // trailing part (empty when the input ends with the separator)
    parts.emplace_back(input, part_begin, std::string::npos);
    return parts;
}

// Return a copy of `str` without leading and trailing whitespace, as classified
// by std::isspace. A string consisting only of whitespace yields "".
std::string string_strip(const std::string & str) {
    // std::isspace has undefined behaviour for negative values other than EOF,
    // so bytes >= 0x80 (e.g. UTF-8 sequences) must go through unsigned char
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };

    size_t start = 0;
    size_t end   = str.size();
    while (start < end && is_space(str[start])) {
        start++;
    }
    while (end > start && is_space(str[end - 1])) {
        end--;
    }
    return str.substr(start, end - start);
}
// Replace backslash escape sequences in `input`, in place.
//
// Recognised sequences: \n \r \t \' \" \\ and \xHH, where HH is exactly two
// hexadecimal digits. Any other sequence - including a malformed or truncated
// \x escape - is kept verbatim (backslash included). A lone backslash at the
// very end of the string is kept as well.
//
// The output is never longer than the input, so the string is rewritten from
// front to back and shrunk at the end.
void string_process_escapes(std::string & input) {
    // Value of a single hex digit, or -1 if `c` is not one. The digits are
    // checked explicitly because strtol() also accepts a leading sign or
    // whitespace, which made inputs such as "\x+1" or "\x 1" decode as a byte.
    const auto hex_value = [](char c) -> int {
        if (c >= '0' && c <= '9') { return c - '0'; }
        if (c >= 'a' && c <= 'f') { return c - 'a' + 10; }
        if (c >= 'A' && c <= 'F') { return c - 'A' + 10; }
        return -1;
    };

    const std::size_t input_len = input.length();
    std::size_t output_idx = 0;

    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
            switch (input[++input_idx]) {
                case 'n':  input[output_idx++] = '\n'; break;
                case 'r':  input[output_idx++] = '\r'; break;
                case 't':  input[output_idx++] = '\t'; break;
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
                case 'x':
                    // Handle \x12, etc
                    if (input_idx + 2 < input_len) {
                        const int hi = hex_value(input[input_idx + 1]);
                        const int lo = hex_value(input[input_idx + 2]);
                        if (hi >= 0 && lo >= 0) {
                            input_idx += 2;
                            input[output_idx++] = char(hi * 16 + lo);
                            break;
                        }
                    }
                    // not a valid \xHH escape: keep it verbatim
                    // fall through
                default:
                    input[output_idx++] = '\\';
                    input[output_idx++] = input[input_idx];
                    break;
            }
        } else {
            input[output_idx++] = input[input_idx];
        }
    }

    input.resize(output_idx);
}
// Check whether `filename` is safe to use as a single path component.
// To validate a full path, split it on the OS-specific separator and run every
// component through this function.
//
// Returns false for names that are empty, longer than 255 bytes, not valid
// canonical UTF-8, contain a forbidden code point, start or end with a space,
// end with a dot, or contain "..".
bool fs_validate_filename(const std::string & filename) {
    // Empty names are invalid; 255 is the common upper bound for a file name on
    // Linux filesystems (systems with smaller limits are left to the OS)
    if (filename.empty() || filename.length() > 255) {
        return false;
    }

    std::u32string codepoints;
    try {
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8_conv;
        codepoints = utf8_conv.from_bytes(filename);

        // A round trip that does not reproduce the input means overlong UTF-8
        // sequences or otherwise invalid encodings were used: reject those
        if (utf8_conv.to_bytes(codepoints) != filename) {
            return false;
        }
    } catch (const std::exception &) {
        return false;
    }

    // Forbidden code points:
    // - control characters (C0, DEL, C1)
    // - UTF-16 surrogates
    // - Unicode look-alikes of '.', '/' and '\'
    // - replacement character and byte order mark
    // - the characters / \ : * ? " < > |
    const auto is_forbidden = [](char32_t c) -> bool {
        if (c <= 0x1F || c == 0x7F) {
            return true; // C0 control characters and DEL
        }
        if (c >= 0x80 && c <= 0x9F) {
            return true; // C1 control characters
        }
        if (c >= 0xD800 && c <= 0xDFFF) {
            return true; // UTF-16 surrogate pairs
        }
        switch (c) {
            case 0xFF0E: // Fullwidth Full Stop (period equivalent)
            case 0x2215: // Division Slash (forward slash equivalent)
            case 0x2216: // Set Minus (backslash equivalent)
            case 0xFFFD: // Replacement Character (UTF-8)
            case 0xFEFF: // Byte Order Mark (BOM)
            case U'/': case U'\\': case U':': case U'*': case U'?':
            case U'"': case U'<':  case U'>': case U'|':
                return true;
            default:
                return false;
        }
    };

    for (const char32_t c : codepoints) {
        if (is_forbidden(c)) {
            return false;
        }
    }

    // Leading/trailing ' ' and a trailing '.' are stripped on Windows, which
    // would silently produce a different file name. Only the 0x20 space is
    // affected, not Unicode or other whitespace.
    const char first = filename.front();
    const char last  = filename.back();
    if (first == ' ' || last == ' ' || last == '.') {
        return false;
    }

    // Any ".." is rejected (stricter than necessary: comparing against ".."
    // alone would be enough)
    if (filename.find("..") != std::string::npos) {
        return false;
    }

    // Reject "."
    if (filename == ".") {
        return false;
    }

    return true;
}
directory 2288 if (stat(subpath.c_str(), &info) == 0) { 2289 if (!S_ISDIR(info.st_mode)) { 2290 return false; 2291 } 2292 } else { 2293 // create parent directories 2294 const int ret = mkdir(subpath.c_str(), 0755); 2295 if (ret != 0) { 2296 return false; 2297 } 2298 } 2299 2300 pos_slash += 1; 2301 } 2302 2303 return true; 2304 #endif // _WIN32 2305 } 2306 2307 std::string fs_get_cache_directory() { 2308 std::string cache_directory = ""; 2309 auto ensure_trailing_slash = [](std::string p) { 2310 // Make sure to add trailing slash 2311 if (p.back() != DIRECTORY_SEPARATOR) { 2312 p += DIRECTORY_SEPARATOR; 2313 } 2314 return p; 2315 }; 2316 if (getenv("LLAMA_CACHE")) { 2317 cache_directory = std::getenv("LLAMA_CACHE"); 2318 } else { 2319 #ifdef __linux__ 2320 if (std::getenv("XDG_CACHE_HOME")) { 2321 cache_directory = std::getenv("XDG_CACHE_HOME"); 2322 } else { 2323 cache_directory = std::getenv("HOME") + std::string("/.cache/"); 2324 } 2325 #elif defined(__APPLE__) 2326 cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); 2327 #elif defined(_WIN32) 2328 cache_directory = std::getenv("LOCALAPPDATA"); 2329 #endif // __linux__ 2330 cache_directory = ensure_trailing_slash(cache_directory); 2331 cache_directory += "llama.cpp"; 2332 } 2333 return ensure_trailing_slash(cache_directory); 2334 } 2335 2336 std::string fs_get_cache_file(const std::string & filename) { 2337 GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos); 2338 std::string cache_directory = fs_get_cache_directory(); 2339 const bool success = fs_create_directory_with_parents(cache_directory); 2340 if (!success) { 2341 throw std::runtime_error("failed to create cache directory: " + cache_directory); 2342 } 2343 return cache_directory + filename; 2344 } 2345 2346 2347 // 2348 // Model utils 2349 // 2350 2351 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) { 2352 auto mparams = 
llama_model_params_from_gpt_params(params); 2353 2354 llama_model * model = nullptr; 2355 2356 if (!params.hf_repo.empty() && !params.hf_file.empty()) { 2357 model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); 2358 } else if (!params.model_url.empty()) { 2359 model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); 2360 } else { 2361 model = llama_load_model_from_file(params.model.c_str(), mparams); 2362 } 2363 2364 if (model == NULL) { 2365 fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); 2366 return std::make_tuple(nullptr, nullptr); 2367 } 2368 2369 auto cparams = llama_context_params_from_gpt_params(params); 2370 2371 llama_context * lctx = llama_new_context_with_model(model, cparams); 2372 if (lctx == NULL) { 2373 fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); 2374 llama_free_model(model); 2375 return std::make_tuple(nullptr, nullptr); 2376 } 2377 2378 if (!params.control_vectors.empty()) { 2379 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; 2380 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); 2381 2382 const auto cvec = llama_control_vector_load(params.control_vectors); 2383 if (cvec.n_embd == -1) { 2384 llama_free(lctx); 2385 llama_free_model(model); 2386 return std::make_tuple(nullptr, nullptr); 2387 } 2388 2389 int err = llama_control_vector_apply(lctx, 2390 cvec.data.data(), 2391 cvec.data.size(), 2392 cvec.n_embd, 2393 params.control_vector_layer_start, 2394 params.control_vector_layer_end); 2395 if (err) { 2396 llama_free(lctx); 2397 llama_free_model(model); 2398 return std::make_tuple(nullptr, nullptr); 2399 } 2400 } 2401 2402 for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { 2403 const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); 2404 
float lora_scale = std::get<1>(params.lora_adapter[i]); 2405 int err = llama_model_apply_lora_from_file(model, 2406 lora_adapter.c_str(), 2407 lora_scale, 2408 ((i > 0) || params.lora_base.empty()) 2409 ? NULL 2410 : params.lora_base.c_str(), 2411 params.n_threads); 2412 if (err != 0) { 2413 fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); 2414 llama_free(lctx); 2415 llama_free_model(model); 2416 return std::make_tuple(nullptr, nullptr); 2417 } 2418 } 2419 2420 if (params.ignore_eos) { 2421 params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; 2422 } 2423 2424 if (params.warmup) { 2425 LOG("warming up the model with an empty run\n"); 2426 2427 std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), }; 2428 llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); 2429 llama_kv_cache_clear(lctx); 2430 llama_synchronize(lctx); 2431 llama_reset_timings(lctx); 2432 } 2433 2434 return std::make_tuple(model, lctx); 2435 } 2436 2437 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { 2438 auto mparams = llama_model_default_params(); 2439 2440 if (params.n_gpu_layers != -1) { 2441 mparams.n_gpu_layers = params.n_gpu_layers; 2442 } 2443 mparams.rpc_servers = params.rpc_servers.c_str(); 2444 mparams.main_gpu = params.main_gpu; 2445 mparams.split_mode = params.split_mode; 2446 mparams.tensor_split = params.tensor_split; 2447 mparams.use_mmap = params.use_mmap; 2448 mparams.use_mlock = params.use_mlock; 2449 mparams.check_tensors = params.check_tensors; 2450 if (params.kv_overrides.empty()) { 2451 mparams.kv_overrides = NULL; 2452 } else { 2453 GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key"); 2454 mparams.kv_overrides = params.kv_overrides.data(); 2455 } 2456 2457 return mparams; 2458 } 2459 2460 static ggml_type kv_cache_type_from_str(const std::string & s) { 2461 if (s == "f32") { 
2462 return GGML_TYPE_F32; 2463 } 2464 if (s == "f16") { 2465 return GGML_TYPE_F16; 2466 } 2467 if (s == "q8_0") { 2468 return GGML_TYPE_Q8_0; 2469 } 2470 if (s == "q4_0") { 2471 return GGML_TYPE_Q4_0; 2472 } 2473 if (s == "q4_1") { 2474 return GGML_TYPE_Q4_1; 2475 } 2476 if (s == "iq4_nl") { 2477 return GGML_TYPE_IQ4_NL; 2478 } 2479 if (s == "q5_0") { 2480 return GGML_TYPE_Q5_0; 2481 } 2482 if (s == "q5_1") { 2483 return GGML_TYPE_Q5_1; 2484 } 2485 2486 throw std::runtime_error("Invalid cache type: " + s); 2487 } 2488 2489 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { 2490 auto cparams = llama_context_default_params(); 2491 2492 cparams.n_ctx = params.n_ctx; 2493 cparams.n_seq_max = params.n_parallel; 2494 cparams.n_batch = params.n_batch; 2495 cparams.n_ubatch = params.n_ubatch; 2496 cparams.n_threads = params.n_threads; 2497 cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; 2498 cparams.seed = params.seed; 2499 cparams.logits_all = params.logits_all; 2500 cparams.embeddings = params.embedding; 2501 cparams.rope_scaling_type = params.rope_scaling_type; 2502 cparams.rope_freq_base = params.rope_freq_base; 2503 cparams.rope_freq_scale = params.rope_freq_scale; 2504 cparams.yarn_ext_factor = params.yarn_ext_factor; 2505 cparams.yarn_attn_factor = params.yarn_attn_factor; 2506 cparams.yarn_beta_fast = params.yarn_beta_fast; 2507 cparams.yarn_beta_slow = params.yarn_beta_slow; 2508 cparams.yarn_orig_ctx = params.yarn_orig_ctx; 2509 cparams.pooling_type = params.pooling_type; 2510 cparams.defrag_thold = params.defrag_thold; 2511 cparams.cb_eval = params.cb_eval; 2512 cparams.cb_eval_user_data = params.cb_eval_user_data; 2513 cparams.offload_kqv = !params.no_kv_offload; 2514 cparams.flash_attn = params.flash_attn; 2515 2516 cparams.type_k = kv_cache_type_from_str(params.cache_type_k); 2517 cparams.type_v = kv_cache_type_from_str(params.cache_type_v); 2518 2519 return 
cparams; 2520 } 2521 2522 #ifdef LLAMA_USE_CURL 2523 2524 static bool starts_with(const std::string & str, const std::string & prefix) { 2525 // While we wait for C++20's std::string::starts_with... 2526 return str.rfind(prefix, 0) == 0; 2527 } 2528 2529 static bool llama_download_file(const std::string & url, const std::string & path) { 2530 2531 // Initialize libcurl 2532 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); 2533 if (!curl) { 2534 fprintf(stderr, "%s: error initializing libcurl\n", __func__); 2535 return false; 2536 } 2537 2538 bool force_download = false; 2539 2540 // Set the URL, allow to follow http redirection 2541 curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); 2542 curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); 2543 2544 #if defined(_WIN32) 2545 // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of 2546 // operating system. Currently implemented under MS-Windows. 2547 curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); 2548 #endif 2549 2550 // Check if the file already exists locally 2551 struct stat model_file_info; 2552 auto file_exists = (stat(path.c_str(), &model_file_info) == 0); 2553 2554 // If the file exists, check its JSON metadata companion file. 2555 std::string metadata_path = path + ".json"; 2556 nlohmann::json metadata; 2557 std::string etag; 2558 std::string last_modified; 2559 2560 if (file_exists) { 2561 // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
2562 std::ifstream metadata_in(metadata_path); 2563 if (metadata_in.good()) { 2564 try { 2565 metadata_in >> metadata; 2566 fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); 2567 if (metadata.contains("url") && metadata.at("url").is_string()) { 2568 auto previous_url = metadata.at("url").get<std::string>(); 2569 if (previous_url != url) { 2570 fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); 2571 return false; 2572 } 2573 } 2574 if (metadata.contains("etag") && metadata.at("etag").is_string()) { 2575 etag = metadata.at("etag"); 2576 } 2577 if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { 2578 last_modified = metadata.at("lastModified"); 2579 } 2580 } catch (const nlohmann::json::exception & e) { 2581 fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); 2582 return false; 2583 } 2584 } 2585 } else { 2586 fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); 2587 } 2588 2589 // Send a HEAD request to retrieve the etag and last-modified headers 2590 struct llama_load_model_from_url_headers { 2591 std::string etag; 2592 std::string last_modified; 2593 }; 2594 llama_load_model_from_url_headers headers; 2595 { 2596 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); 2597 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { 2598 llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; 2599 2600 static std::regex header_regex("([^:]+): (.*)\r\n"); 2601 static std::regex etag_regex("ETag", std::regex_constants::icase); 2602 static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); 2603 2604 std::string header(buffer, n_items); 2605 std::smatch match; 2606 if (std::regex_match(header, match, header_regex)) { 
2607 const std::string & key = match[1]; 2608 const std::string & value = match[2]; 2609 if (std::regex_match(key, match, etag_regex)) { 2610 headers->etag = value; 2611 } else if (std::regex_match(key, match, last_modified_regex)) { 2612 headers->last_modified = value; 2613 } 2614 } 2615 return n_items; 2616 }; 2617 2618 curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb 2619 curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress 2620 curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback)); 2621 curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); 2622 2623 CURLcode res = curl_easy_perform(curl.get()); 2624 if (res != CURLE_OK) { 2625 fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); 2626 return false; 2627 } 2628 2629 long http_code = 0; 2630 curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); 2631 if (http_code != 200) { 2632 // HEAD not supported, we don't know if the file has changed 2633 // force trigger downloading 2634 force_download = true; 2635 fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); 2636 } 2637 } 2638 2639 bool should_download = !file_exists || force_download; 2640 if (!should_download) { 2641 if (!etag.empty() && etag != headers.etag) { 2642 fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); 2643 should_download = true; 2644 } else if (!last_modified.empty() && last_modified != headers.last_modified) { 2645 fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); 2646 should_download = true; 2647 } 2648 } 2649 if (should_download) { 2650 std::string path_temporary = path + ".downloadInProgress"; 2651 if (file_exists) { 2652 fprintf(stderr, "%s: 
deleting previous downloaded file: %s\n", __func__, path.c_str()); 2653 if (remove(path.c_str()) != 0) { 2654 fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); 2655 return false; 2656 } 2657 } 2658 2659 // Set the output file 2660 std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose); 2661 if (!outfile) { 2662 fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); 2663 return false; 2664 } 2665 2666 typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); 2667 auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { 2668 return fwrite(data, size, nmemb, (FILE *)fd); 2669 }; 2670 curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); 2671 curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback)); 2672 curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); 2673 2674 // display download progress 2675 curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); 2676 2677 // helper function to hide password in URL 2678 auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { 2679 std::size_t protocol_pos = url.find("://"); 2680 if (protocol_pos == std::string::npos) { 2681 return url; // Malformed URL 2682 } 2683 2684 std::size_t at_pos = url.find('@', protocol_pos + 3); 2685 if (at_pos == std::string::npos) { 2686 return url; // No password in URL 2687 } 2688 2689 return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); 2690 }; 2691 2692 // start the download 2693 fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, 2694 llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); 2695 auto res = curl_easy_perform(curl.get()); 2696 if (res != CURLE_OK) { 2697 fprintf(stderr, "%s: curl_easy_perform() 
failed: %s\n", __func__, curl_easy_strerror(res)); 2698 return false; 2699 } 2700 2701 long http_code = 0; 2702 curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); 2703 if (http_code < 200 || http_code >= 400) { 2704 fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); 2705 return false; 2706 } 2707 2708 // Causes file to be closed explicitly here before we rename it. 2709 outfile.reset(); 2710 2711 // Write the updated JSON metadata file. 2712 metadata.update({ 2713 {"url", url}, 2714 {"etag", headers.etag}, 2715 {"lastModified", headers.last_modified} 2716 }); 2717 std::ofstream(metadata_path) << metadata.dump(4); 2718 fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); 2719 2720 if (rename(path_temporary.c_str(), path.c_str()) != 0) { 2721 fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); 2722 return false; 2723 } 2724 } 2725 2726 return true; 2727 } 2728 2729 struct llama_model * llama_load_model_from_url( 2730 const char * model_url, 2731 const char * path_model, 2732 const struct llama_model_params & params) { 2733 // Basic validation of the model_url 2734 if (!model_url || strlen(model_url) == 0) { 2735 fprintf(stderr, "%s: invalid model_url\n", __func__); 2736 return NULL; 2737 } 2738 2739 if (!llama_download_file(model_url, path_model)) { 2740 return NULL; 2741 } 2742 2743 // check for additional GGUFs split to download 2744 int n_split = 0; 2745 { 2746 struct gguf_init_params gguf_params = { 2747 /*.no_alloc = */ true, 2748 /*.ctx = */ NULL, 2749 }; 2750 auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); 2751 if (!ctx_gguf) { 2752 fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); 2753 return NULL; 2754 } 2755 2756 auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); 2757 if (key_n_split >= 0) { 2758 n_split = gguf_get_val_u16(ctx_gguf, key_n_split); 2759 } 2760 
2761 gguf_free(ctx_gguf); 2762 } 2763 2764 if (n_split > 1) { 2765 char split_prefix[PATH_MAX] = {0}; 2766 char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; 2767 2768 // Verify the first split file format 2769 // and extract split URL and PATH prefixes 2770 { 2771 if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { 2772 fprintf(stderr, "\n%s: unexpected model file name: %s" 2773 " n_split=%d\n", __func__, path_model, n_split); 2774 return NULL; 2775 } 2776 2777 if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { 2778 fprintf(stderr, "\n%s: unexpected model url: %s" 2779 " n_split=%d\n", __func__, model_url, n_split); 2780 return NULL; 2781 } 2782 } 2783 2784 // Prepare download in parallel 2785 std::vector<std::future<bool>> futures_download; 2786 for (int idx = 1; idx < n_split; idx++) { 2787 futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { 2788 char split_path[PATH_MAX] = {0}; 2789 llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); 2790 2791 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; 2792 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); 2793 2794 return llama_download_file(split_url, split_path); 2795 }, idx)); 2796 } 2797 2798 // Wait for all downloads to complete 2799 for (auto & f : futures_download) { 2800 if (!f.get()) { 2801 return NULL; 2802 } 2803 } 2804 } 2805 2806 return llama_load_model_from_file(path_model, params); 2807 } 2808 2809 struct llama_model * llama_load_model_from_hf( 2810 const char * repo, 2811 const char * model, 2812 const char * path_model, 2813 const struct llama_model_params & params) { 2814 // construct hugging face model url: 2815 // 2816 // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf 2817 // 
https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf 2818 // 2819 // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf 2820 // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf 2821 // 2822 2823 std::string model_url = "https://huggingface.co/"; 2824 model_url += repo; 2825 model_url += "/resolve/main/"; 2826 model_url += model; 2827 2828 return llama_load_model_from_url(model_url.c_str(), path_model, params); 2829 } 2830 2831 #else 2832 2833 struct llama_model * llama_load_model_from_url( 2834 const char * /*model_url*/, 2835 const char * /*path_model*/, 2836 const struct llama_model_params & /*params*/) { 2837 fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); 2838 return nullptr; 2839 } 2840 2841 struct llama_model * llama_load_model_from_hf( 2842 const char * /*repo*/, 2843 const char * /*model*/, 2844 const char * /*path_model*/, 2845 const struct llama_model_params & /*params*/) { 2846 fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); 2847 return nullptr; 2848 } 2849 2850 #endif // LLAMA_USE_CURL 2851 2852 // 2853 // Batch utils 2854 // 2855 2856 void llama_batch_clear(struct llama_batch & batch) { 2857 batch.n_tokens = 0; 2858 } 2859 2860 void llama_batch_add( 2861 struct llama_batch & batch, 2862 llama_token id, 2863 llama_pos pos, 2864 const std::vector<llama_seq_id> & seq_ids, 2865 bool logits) { 2866 batch.token [batch.n_tokens] = id; 2867 batch.pos [batch.n_tokens] = pos; 2868 batch.n_seq_id[batch.n_tokens] = seq_ids.size(); 2869 for (size_t i = 0; i < seq_ids.size(); ++i) { 2870 batch.seq_id[batch.n_tokens][i] = seq_ids[i]; 2871 } 2872 batch.logits [batch.n_tokens] = logits; 2873 2874 batch.n_tokens++; 2875 } 2876 2877 // 2878 // Vocab utils 2879 // 2880 2881 std::vector<llama_token> llama_tokenize( 2882 const struct 
llama_context * ctx, 2883 const std::string & text, 2884 bool add_special, 2885 bool parse_special) { 2886 return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special); 2887 } 2888 2889 std::vector<llama_token> llama_tokenize( 2890 const struct llama_model * model, 2891 const std::string & text, 2892 bool add_special, 2893 bool parse_special) { 2894 // upper limit for the number of tokens 2895 int n_tokens = text.length() + 2 * add_special; 2896 std::vector<llama_token> result(n_tokens); 2897 n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); 2898 if (n_tokens < 0) { 2899 result.resize(-n_tokens); 2900 int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); 2901 GGML_ASSERT(check == -n_tokens); 2902 } else { 2903 result.resize(n_tokens); 2904 } 2905 return result; 2906 } 2907 2908 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { 2909 std::vector<char> result(8, 0); 2910 const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); 2911 if (n_tokens < 0) { 2912 result.resize(-n_tokens); 2913 int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); 2914 GGML_ASSERT(check == -n_tokens); 2915 } else { 2916 result.resize(n_tokens); 2917 } 2918 2919 return std::string(result.data(), result.size()); 2920 } 2921 2922 std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) { 2923 const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); 2924 2925 std::string piece; 2926 std::string result; 2927 2928 for (size_t i = 0; i < tokens.size(); ++i) { 2929 piece = llama_token_to_piece(ctx, tokens[i]); 2930 2931 // remove the leading space of the first non-BOS token 2932 if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) 
&& piece[0] == ' ') { 2933 piece = piece.substr(1); 2934 } 2935 2936 result += piece; 2937 } 2938 2939 return result; 2940 } 2941 2942 std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) { 2943 std::string piece; 2944 std::string result; 2945 2946 for (size_t i = 0; i < tokens.size(); ++i) { 2947 piece = llama_token_to_piece(ctx, tokens[i]); 2948 2949 result += piece; 2950 } 2951 2952 // NOTE: the original tokenizer decodes bytes after collecting the pieces. 2953 return result; 2954 } 2955 2956 bool llama_should_add_bos_token(const llama_model * model) { 2957 const int add_bos = llama_add_bos_token(model); 2958 2959 return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); 2960 } 2961 2962 bool llama_chat_verify_template(const std::string & tmpl) { 2963 llama_chat_message chat[] = {{"user", "test"}}; 2964 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); 2965 return res >= 0; 2966 } 2967 2968 // 2969 // KV cache utils 2970 // 2971 2972 void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) { 2973 static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; 2974 2975 printf("=== Dumping KV cache. 
total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", 2976 view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); 2977 2978 llama_kv_cache_view_cell * c_curr = view.cells; 2979 llama_seq_id * cs_curr = view.cells_sequences; 2980 2981 for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { 2982 if (i % row_size == 0) { 2983 printf("\n%5d: ", i); 2984 } 2985 int seq_count = 0; 2986 for (int j = 0; j < view.n_seq_max; j++) { 2987 if (cs_curr[j] >= 0) { seq_count++; } 2988 } 2989 putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); 2990 } 2991 2992 printf("\n=== Done dumping\n"); 2993 } 2994 2995 void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) { 2996 static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; 2997 2998 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", 2999 view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); 3000 3001 std::unordered_map<llama_seq_id, size_t> seqs; 3002 llama_kv_cache_view_cell * c_curr = view.cells; 3003 llama_seq_id * cs_curr = view.cells_sequences; 3004 3005 for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { 3006 for (int j = 0; j < view.n_seq_max; j++) { 3007 if (cs_curr[j] < 0) { continue; } 3008 if (seqs.find(cs_curr[j]) == seqs.end()) { 3009 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } 3010 const size_t sz = seqs.size(); 3011 seqs[cs_curr[j]] = sz; 3012 } 3013 } 3014 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } 3015 } 3016 3017 printf("=== Sequence legend: "); 3018 for (const auto & it : seqs) { 3019 printf("%zu=%d, ", it.second, it.first); 3020 } 3021 printf("'+'=other sequence ids"); 3022 3023 
c_curr = view.cells; 3024 cs_curr = view.cells_sequences; 3025 for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { 3026 if (i % row_size == 0) { 3027 printf("\n%5d: ", i); 3028 } 3029 for (int j = 0; j < view.n_seq_max; j++) { 3030 if (cs_curr[j] >= 0) { 3031 const auto & it = seqs.find(cs_curr[j]); 3032 putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); 3033 } else { 3034 putchar('.'); 3035 } 3036 } 3037 putchar(' '); 3038 } 3039 3040 printf("\n=== Done dumping\n"); 3041 } 3042 3043 // 3044 // Embedding utils 3045 // 3046 3047 void llama_embd_normalize(const float * inp, float * out, int n) { 3048 double sum = 0.0; 3049 for (int i = 0; i < n; i++) { 3050 sum += inp[i] * inp[i]; 3051 } 3052 sum = sqrt(sum); 3053 3054 const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; 3055 3056 for (int i = 0; i < n; i++) { 3057 out[i] = inp[i] * norm; 3058 } 3059 } 3060 3061 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ 3062 double sum = 0.0; 3063 double sum1 = 0.0; 3064 double sum2 = 0.0; 3065 3066 for (int i = 0; i < n; i++) { 3067 sum += embd1[i] * embd2[i]; 3068 sum1 += embd1[i] * embd1[i]; 3069 sum2 += embd2[i] * embd2[i]; 3070 } 3071 3072 return sum / (sqrt(sum1) * sqrt(sum2)); 3073 } 3074 3075 // 3076 // Control vector utils 3077 // 3078 3079 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { 3080 int32_t n_tensors; 3081 3082 size_t n_bytes = 0; 3083 3084 uint32_t max_direction_layer = 0; 3085 3086 llama_control_vector_data result = { -1, {} }; 3087 3088 // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer 3089 { 3090 struct ggml_init_params meta_params = { 3091 /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), 3092 /* .mem_buffer = */ nullptr, 3093 /* .no_alloc = */ true, 3094 }; 3095 ggml_context * meta_ctx = ggml_init(meta_params); 3096 struct gguf_init_params meta_gguf_params = { 
3097 /* .no_alloc = */ true, 3098 /* .ctx = */ &meta_ctx, 3099 }; 3100 struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); 3101 if (!meta_ctx_gguf) { 3102 fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); 3103 ggml_free(meta_ctx); 3104 return result; 3105 } 3106 3107 n_tensors = gguf_get_n_tensors(meta_ctx_gguf); 3108 for (int i = 0; i < n_tensors; i++) { 3109 std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); 3110 3111 // split on '.' 3112 size_t dotpos = name.find('.'); 3113 if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { 3114 try { 3115 uint32_t layer = std::stoi(name.substr(dotpos + 1)); 3116 if (layer == 0) { 3117 fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); 3118 ggml_free(meta_ctx); 3119 gguf_free(meta_ctx_gguf); 3120 return result; 3121 } 3122 if (layer > max_direction_layer) { 3123 max_direction_layer = layer; 3124 } 3125 } catch (...) 
{ 3126 fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); 3127 ggml_free(meta_ctx); 3128 gguf_free(meta_ctx_gguf); 3129 return result; 3130 } 3131 } 3132 3133 struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); 3134 if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { 3135 fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); 3136 ggml_free(meta_ctx); 3137 gguf_free(meta_ctx_gguf); 3138 return result; 3139 } 3140 if (result.n_embd == -1) { 3141 result.n_embd = ggml_nelements(tensor_meta); 3142 } else if (ggml_nelements(tensor_meta) != result.n_embd) { 3143 fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); 3144 ggml_free(meta_ctx); 3145 gguf_free(meta_ctx_gguf); 3146 return result; 3147 } 3148 n_bytes += ggml_nbytes(tensor_meta); 3149 } 3150 ggml_free(meta_ctx); 3151 gguf_free(meta_ctx_gguf); 3152 } 3153 3154 if (n_tensors == 0) { 3155 fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); 3156 return result; 3157 } 3158 3159 // load and scale tensors into final control vector context 3160 struct ggml_init_params ggml_params = { 3161 /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, 3162 /* .mem_buffer = */ nullptr, 3163 /* .no_alloc = */ false, 3164 }; 3165 struct ggml_context * ctx = ggml_init(ggml_params); 3166 3167 struct gguf_init_params params = { 3168 /*.no_alloc = */ false, 3169 /*.ctx = */ &ctx, 3170 }; 3171 struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); 3172 if (!ctx_gguf) { 3173 fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); 3174 ggml_free(ctx); 3175 return result; 3176 } 3177 3178 // do not store data for layer 0 (it's not used) 3179 result.data.resize(result.n_embd * max_direction_layer); 3180 3181 for (uint32_t il = 1; il <= 
max_direction_layer; il++) { 3182 const std::string name = "direction." + std::to_string(il); 3183 const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); 3184 3185 float * dst = result.data.data() + result.n_embd * (il - 1); 3186 3187 if (tensor) { 3188 const float * src = (const float *) tensor->data; 3189 for (int j = 0; j < result.n_embd; j++) { 3190 dst[j] = src[j] * load_info.strength; 3191 } 3192 } else { 3193 for (int j = 0; j < result.n_embd; j++) { 3194 dst[j] = 0.0f; 3195 } 3196 } 3197 } 3198 3199 return result; 3200 } 3201 3202 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) { 3203 llama_control_vector_data result = { -1, {} }; 3204 3205 for (const auto & info : load_infos) { 3206 auto cur = llama_control_vector_load_one(info); 3207 3208 if (cur.n_embd == -1) { 3209 return result; 3210 } 3211 if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { 3212 fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); 3213 return result; 3214 } 3215 3216 if (result.n_embd == -1) { 3217 result = std::move(cur); 3218 } else { 3219 for (size_t i = 0; i < cur.data.size(); i++) { 3220 result.data[i] += cur.data[i]; 3221 } 3222 } 3223 } 3224 3225 if (result.n_embd == -1) { 3226 fprintf(stderr, "%s: no vectors passed\n", __func__); 3227 } 3228 3229 return result; 3230 } 3231 3232 // 3233 // YAML utils 3234 // 3235 3236 void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) { 3237 if (data.empty()) { 3238 fprintf(stream, "%s:\n", prop_name); 3239 return; 3240 } 3241 3242 fprintf(stream, "%s: [", prop_name); 3243 for (size_t i = 0; i < data.size() - 1; ++i) { 3244 fprintf(stream, "%e, ", data[i]); 3245 } 3246 fprintf(stream, "%e]\n", data.back()); 3247 } 3248 3249 void yaml_dump_vector_int(FILE * stream, const char * prop_name, const 
std::vector<int> & data) { 3250 if (data.empty()) { 3251 fprintf(stream, "%s:\n", prop_name); 3252 return; 3253 } 3254 3255 fprintf(stream, "%s: [", prop_name); 3256 for (size_t i = 0; i < data.size() - 1; ++i) { 3257 fprintf(stream, "%d, ", data[i]); 3258 } 3259 fprintf(stream, "%d]\n", data.back()); 3260 } 3261 3262 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) { 3263 std::string data_str(data == NULL ? "" : data); 3264 3265 if (data_str.empty()) { 3266 fprintf(stream, "%s:\n", prop_name); 3267 return; 3268 } 3269 3270 size_t pos_start = 0; 3271 size_t pos_found = 0; 3272 3273 if (std::isspace(data_str[0]) || std::isspace(data_str.back())) { 3274 data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); 3275 data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); 3276 data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); 3277 data_str = "\"" + data_str + "\""; 3278 fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); 3279 return; 3280 } 3281 3282 if (data_str.find('\n') == std::string::npos) { 3283 fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); 3284 return; 3285 } 3286 3287 fprintf(stream, "%s: |\n", prop_name); 3288 while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { 3289 fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); 3290 pos_start = pos_found + 1; 3291 } 3292 } 3293 3294 void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx, 3295 const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) { 3296 const llama_sampling_params & sparams = params.sparams; 3297 3298 fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT); 3299 fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER); 3300 fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? 
"true" : "false"); 3301 fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); 3302 fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false"); 3303 fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); 3304 fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); 3305 fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); 3306 fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); 3307 fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); 3308 fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); 3309 fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); 3310 fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); 3311 fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); 3312 fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); 3313 fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); 3314 fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); 3315 fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); 3316 fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); 3317 fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); 3318 fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); 3319 fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); 3320 fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? 
"true" : "false"); 3321 3322 #ifdef NDEBUG 3323 fprintf(stream, "debug: false\n"); 3324 #else 3325 fprintf(stream, "debug: true\n"); 3326 #endif // NDEBUG 3327 3328 fprintf(stream, "model_desc: %s\n", model_desc); 3329 fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx))); 3330 3331 #ifdef __OPTIMIZE__ 3332 fprintf(stream, "optimize: true\n"); 3333 #else 3334 fprintf(stream, "optimize: false\n"); 3335 #endif // __OPTIMIZE__ 3336 3337 fprintf(stream, "time: %s\n", timestamp.c_str()); 3338 3339 fprintf(stream, "\n"); 3340 fprintf(stream, "###############\n"); 3341 fprintf(stream, "# User Inputs #\n"); 3342 fprintf(stream, "###############\n"); 3343 fprintf(stream, "\n"); 3344 3345 fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); 3346 fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); 3347 yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); 3348 fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); 3349 fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); 3350 fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); 3351 fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); 3352 fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); 3353 fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); 3354 fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq); 3355 yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str()); 3356 fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); 3357 fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? 
"true" : "false"); 3358 fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); 3359 3360 const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); 3361 const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; 3362 fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); 3363 3364 yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); 3365 fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); 3366 yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); 3367 fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); 3368 fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); 3369 fprintf(stream, "keep: %d # default: 0\n", params.n_keep); 3370 fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); 3371 3372 fprintf(stream, "logit_bias:\n"); 3373 for (std::pair<llama_token, float> lb : sparams.logit_bias) { 3374 if (ignore_eos && lb.first == logit_bias_eos->first) { 3375 continue; 3376 } 3377 fprintf(stream, " %d: %f", lb.first, lb.second); 3378 } 3379 3380 fprintf(stream, "lora:\n"); 3381 for (std::tuple<std::string, float> la : params.lora_adapter) { 3382 if (std::get<1>(la) != 1.0f) { 3383 continue; 3384 } 3385 fprintf(stream, " - %s\n", std::get<0>(la).c_str()); 3386 } 3387 fprintf(stream, "lora_scaled:\n"); 3388 for (std::tuple<std::string, float> la : params.lora_adapter) { 3389 if (std::get<1>(la) == 1.0f) { 3390 continue; 3391 } 3392 fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); 3393 } 3394 fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); 3395 fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); 3396 fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", 
sparams.min_keep); 3397 fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); 3398 fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); 3399 fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); 3400 fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); 3401 fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); 3402 fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); 3403 fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); 3404 fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); 3405 fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); 3406 fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); 3407 fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); 3408 fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); 3409 fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); 3410 fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); 3411 fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); 3412 yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str()); 3413 fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); 3414 fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); 3415 fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? 
"true" : "false"); 3416 yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); 3417 fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); 3418 3419 fprintf(stream, "reverse_prompt:\n"); 3420 for (std::string ap : params.antiprompt) { 3421 size_t pos = 0; 3422 while ((pos = ap.find('\n', pos)) != std::string::npos) { 3423 ap.replace(pos, 1, "\\n"); 3424 pos += 1; 3425 } 3426 3427 fprintf(stream, " - %s\n", ap.c_str()); 3428 } 3429 3430 fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); 3431 fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); 3432 fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); 3433 fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); 3434 fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); 3435 fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); 3436 fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); 3437 3438 const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); 3439 yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector); 3440 3441 fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); 3442 fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); 3443 fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); 3444 fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); 3445 fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); 3446 fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); 3447 fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); 3448 fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); 3449 }