// gguf-split.cpp
1 #include "llama.h" 2 #include "common.h" 3 4 #include <algorithm> 5 #include <cmath> 6 #include <cstdlib> 7 #include <fstream> 8 #include <string> 9 #include <vector> 10 11 #include <stdio.h> 12 #include <string.h> 13 #include <climits> 14 #include <stdexcept> 15 16 #if defined(_WIN32) 17 #include <windows.h> 18 #ifndef PATH_MAX 19 #define PATH_MAX MAX_PATH 20 #endif 21 #include <io.h> 22 #endif 23 24 enum split_operation : uint8_t { 25 SPLIT_OP_SPLIT, 26 SPLIT_OP_MERGE, 27 }; 28 29 struct split_params { 30 split_operation operation = SPLIT_OP_SPLIT; 31 size_t n_bytes_split = 0; 32 int n_split_tensors = 128; 33 std::string input; 34 std::string output; 35 bool no_tensor_first_split = false; 36 bool dry_run = false; 37 }; 38 39 static void split_print_usage(const char * executable) { 40 const split_params default_params; 41 printf("\n"); 42 printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); 43 printf("\n"); 44 printf("Apply a GGUF operation on IN to OUT."); 45 printf("\n"); 46 printf("options:\n"); 47 printf(" -h, --help show this help message and exit\n"); 48 printf(" --version show version and build info\n"); 49 printf(" --split split GGUF to multiple GGUF (enabled by default)\n"); 50 printf(" --merge merge multiple GGUF to a single GGUF\n"); 51 printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); 52 printf(" --split-max-size N(M|G) max size per split\n"); 53 printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); 54 printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); 55 printf("\n"); 56 } 57 58 // return convert string, for example "128M" or "4G" to number of bytes 59 static size_t split_str_to_n_bytes(std::string str) { 60 size_t n_bytes = 0; 61 int n; 62 if (str.back() == 'M') { 63 sscanf(str.c_str(), "%d", &n); 64 n_bytes = (size_t)n * 1000 * 1000; // megabytes 65 } else if (str.back() == 'G') { 66 
sscanf(str.c_str(), "%d", &n); 67 n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes 68 } else { 69 throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); 70 } 71 if (n <= 0) { 72 throw std::invalid_argument("error: size must be a positive value"); 73 } 74 return n_bytes; 75 } 76 77 static void split_params_parse_ex(int argc, const char ** argv, split_params & params) { 78 std::string arg; 79 const std::string arg_prefix = "--"; 80 bool invalid_param = false; 81 82 int arg_idx = 1; 83 for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { 84 arg = argv[arg_idx]; 85 if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { 86 std::replace(arg.begin(), arg.end(), '_', '-'); 87 } 88 89 bool arg_found = false; 90 bool is_op_set = false; 91 bool is_mode_set = false; 92 if (arg == "-h" || arg == "--help") { 93 split_print_usage(argv[0]); 94 exit(0); 95 } 96 if (arg == "--version") { 97 fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); 98 fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); 99 exit(0); 100 } 101 if (arg == "--dry-run") { 102 arg_found = true; 103 params.dry_run = true; 104 } 105 if (arg == "--no-tensor-first-split") { 106 arg_found = true; 107 params.no_tensor_first_split = true; 108 } 109 110 if (is_op_set) { 111 throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); 112 } 113 if (arg == "--merge") { 114 arg_found = true; 115 is_op_set = true; 116 params.operation = SPLIT_OP_MERGE; 117 } 118 if (arg == "--split") { 119 arg_found = true; 120 is_op_set = true; 121 params.operation = SPLIT_OP_SPLIT; 122 } 123 124 if (is_mode_set) { 125 throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); 126 } 127 if (arg == "--split-max-tensors") { 128 if (++arg_idx >= argc) { 129 invalid_param = true; 130 break; 131 } 132 
arg_found = true; 133 is_mode_set = true; 134 params.n_split_tensors = atoi(argv[arg_idx]); 135 } 136 if (arg == "--split-max-size") { 137 if (++arg_idx >= argc) { 138 invalid_param = true; 139 break; 140 } 141 arg_found = true; 142 is_mode_set = true; 143 params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); 144 } 145 146 if (!arg_found) { 147 throw std::invalid_argument("error: unknown argument: " + arg); 148 } 149 } 150 151 if (invalid_param) { 152 throw std::invalid_argument("error: invalid parameter for argument: " + arg); 153 } 154 155 if (argc - arg_idx < 2) { 156 throw std::invalid_argument("error: bad arguments"); 157 } 158 159 params.input = argv[arg_idx++]; 160 params.output = argv[arg_idx++]; 161 } 162 163 static bool split_params_parse(int argc, const char ** argv, split_params & params) { 164 bool result = true; 165 try { 166 split_params_parse_ex(argc, argv, params); 167 } 168 catch (const std::invalid_argument & ex) { 169 fprintf(stderr, "%s\n", ex.what()); 170 split_print_usage(argv[0]); 171 exit(EXIT_FAILURE); 172 } 173 return result; 174 } 175 176 static void zeros(std::ofstream & file, size_t n) { 177 char zero = 0; 178 for (size_t i = 0; i < n; ++i) { 179 file.write(&zero, 1); 180 } 181 } 182 183 struct split_strategy { 184 const split_params params; 185 std::ifstream & f_input; 186 struct gguf_context * ctx_gguf; 187 struct ggml_context * ctx_meta = NULL; 188 const int n_tensors; 189 190 // one ctx_out per one output file 191 std::vector<struct gguf_context *> ctx_outs; 192 193 // temporary buffer for reading in tensor data 194 std::vector<uint8_t> read_buf; 195 196 split_strategy(const split_params & params, 197 std::ifstream & f_input, 198 struct gguf_context * ctx_gguf, 199 struct ggml_context * ctx_meta) : 200 params(params), 201 f_input(f_input), 202 ctx_gguf(ctx_gguf), 203 ctx_meta(ctx_meta), 204 n_tensors(gguf_get_n_tensors(ctx_gguf)) { 205 206 // because we need to know list of tensors for each file in advance, we will build all 
the ctx_out for all output splits 207 int i_split = -1; 208 struct gguf_context * ctx_out = NULL; 209 auto new_ctx_out = [&](bool allow_no_tensors) { 210 i_split++; 211 if (ctx_out != NULL) { 212 if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { 213 fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); 214 exit(EXIT_FAILURE); 215 } 216 ctx_outs.push_back(ctx_out); 217 } 218 ctx_out = gguf_init_empty(); 219 // Save all metadata in first split only 220 if (i_split == 0) { 221 gguf_set_kv(ctx_out, ctx_gguf); 222 } 223 gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); 224 gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder 225 gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); 226 }; 227 228 // initialize ctx_out for the first split 229 new_ctx_out(false); 230 231 // skip first split if no_tensor_first_split is set 232 if (params.no_tensor_first_split) { 233 new_ctx_out(true); 234 } 235 236 // process tensors one by one 237 size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) 238 for (int i = 0; i < n_tensors; ++i) { 239 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); 240 // calculate the "imaginary" size = the current size + next tensor size 241 size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); 242 size_t next_tensors_size = curr_tensors_size + n_bytes; 243 if (should_split(i, next_tensors_size)) { 244 new_ctx_out(false); 245 curr_tensors_size = n_bytes; 246 } else { 247 curr_tensors_size = next_tensors_size; 248 } 249 gguf_add_tensor(ctx_out, t); 250 } 251 252 // push the last ctx_out 253 ctx_outs.push_back(ctx_out); 254 255 // set the correct n_split for all ctx_out 256 for (auto & ctx : ctx_outs) { 257 gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); 258 } 259 } 260 261 ~split_strategy() { 262 for (auto & ctx_out : ctx_outs) { 263 gguf_free(ctx_out); 264 } 265 } 266 267 
bool should_split(int i_tensor, size_t next_size) { 268 if (params.n_bytes_split > 0) { 269 // split by max size per file 270 return next_size > params.n_bytes_split; 271 } else { 272 // split by number of tensors per file 273 return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; 274 } 275 } 276 277 void print_info() { 278 printf("n_split: %ld\n", ctx_outs.size()); 279 int i_split = 0; 280 for (auto & ctx_out : ctx_outs) { 281 // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) 282 size_t total_size = gguf_get_meta_size(ctx_out); 283 for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { 284 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); 285 total_size += ggml_nbytes(t); 286 } 287 total_size = total_size / 1000 / 1000; // convert to megabytes 288 printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); 289 i_split++; 290 } 291 } 292 293 void write() { 294 int i_split = 0; 295 int n_split = ctx_outs.size(); 296 for (auto & ctx_out : ctx_outs) { 297 // construct file path 298 char split_path[PATH_MAX] = {0}; 299 llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); 300 301 // open the output file 302 printf("Writing file %s ... 
", split_path); 303 fflush(stdout); 304 std::ofstream fout = std::ofstream(split_path, std::ios::binary); 305 fout.exceptions(std::ofstream::failbit); // fail fast on write errors 306 307 // write metadata 308 std::vector<uint8_t> data(gguf_get_meta_size(ctx_out)); 309 gguf_get_meta_data(ctx_out, data.data()); 310 fout.write((const char *)data.data(), data.size()); 311 312 // write tensors 313 for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { 314 // read tensor meta and prepare buffer 315 const char * t_name = gguf_get_tensor_name(ctx_out, i); 316 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); 317 auto n_bytes = ggml_nbytes(t); 318 read_buf.resize(n_bytes); 319 320 // calculate offset 321 auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file 322 auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); 323 324 // copy tensor from input to output file 325 copy_file_to_file(f_input, fout, offset, n_bytes); 326 zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); 327 } 328 329 printf("done\n"); 330 // close the file 331 fout.close(); 332 i_split++; 333 } 334 } 335 336 void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { 337 // TODO: detect OS and use copy_file_range() here for better performance 338 if (read_buf.size() < len) { 339 read_buf.resize(len); 340 } 341 f_in.seekg(in_offset); 342 f_in.read((char *)read_buf.data(), len); 343 f_out.write((const char *)read_buf.data(), len); 344 } 345 }; 346 347 static void gguf_split(const split_params & split_params) { 348 struct ggml_context * ctx_meta = NULL; 349 350 struct gguf_init_params params = { 351 /*.no_alloc = */ true, 352 /*.ctx = */ &ctx_meta, 353 }; 354 355 std::ifstream f_input(split_params.input.c_str(), std::ios::binary); 356 if (!f_input.is_open()) { 357 fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, 
split_params.input.c_str()); 358 exit(EXIT_FAILURE); 359 } 360 361 auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); 362 if (!ctx_gguf) { 363 fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); 364 exit(EXIT_FAILURE); 365 } 366 367 // prepare the strategy 368 split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); 369 int n_split = strategy.ctx_outs.size(); 370 strategy.print_info(); 371 372 if (!split_params.dry_run) { 373 // write all output splits 374 strategy.write(); 375 } 376 377 // done, clean up 378 gguf_free(ctx_gguf); 379 f_input.close(); 380 381 fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", 382 __func__, n_split, strategy.n_tensors); 383 } 384 385 static void gguf_merge(const split_params & split_params) { 386 fprintf(stderr, "%s: %s -> %s\n", 387 __func__, split_params.input.c_str(), 388 split_params.output.c_str()); 389 int n_split = 1; 390 int total_tensors = 0; 391 392 auto * ctx_out = gguf_init_empty(); 393 std::ofstream fout(split_params.output.c_str(), std::ios::binary); 394 fout.exceptions(std::ofstream::failbit); // fail fast on write errors 395 396 std::vector<uint8_t> read_data; 397 std::vector<ggml_context *> ctx_metas; 398 std::vector<gguf_context *> ctx_ggufs; 399 400 char split_path[PATH_MAX] = {0}; 401 strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); 402 char split_prefix[PATH_MAX] = {0}; 403 404 // First pass to find KV and tensors metadata 405 for (int i_split = 0; i_split < n_split; i_split++) { 406 struct ggml_context * ctx_meta = NULL; 407 408 struct gguf_init_params params = { 409 /*.no_alloc = */ true, 410 /*.ctx = */ &ctx_meta, 411 }; 412 413 if (i_split > 0) { 414 llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); 415 } 416 fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); 417 418 auto * ctx_gguf = gguf_init_from_file(split_path, params); 419 if 
(!ctx_gguf) { 420 fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); 421 exit(EXIT_FAILURE); 422 } 423 ctx_ggufs.push_back(ctx_gguf); 424 ctx_metas.push_back(ctx_meta); 425 426 if (i_split == 0) { 427 auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); 428 if (key_n_split < 0) { 429 fprintf(stderr, 430 "\n%s: input file does not contain %s metadata\n", 431 __func__, 432 LLM_KV_SPLIT_COUNT); 433 gguf_free(ctx_gguf); 434 ggml_free(ctx_meta); 435 gguf_free(ctx_out); 436 fout.close(); 437 exit(EXIT_FAILURE); 438 } 439 440 n_split = gguf_get_val_u16(ctx_gguf, key_n_split); 441 if (n_split < 1) { 442 fprintf(stderr, 443 "\n%s: input file does not contain a valid split count %d\n", 444 __func__, 445 n_split); 446 gguf_free(ctx_gguf); 447 ggml_free(ctx_meta); 448 gguf_free(ctx_out); 449 fout.close(); 450 exit(EXIT_FAILURE); 451 } 452 453 // Verify the file naming and extract split_prefix 454 if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { 455 fprintf(stderr, "\n%s: unexpected input file name: %s" 456 " i_split=%d" 457 " n_split=%d\n", __func__, 458 split_path, i_split, n_split); 459 gguf_free(ctx_gguf); 460 ggml_free(ctx_meta); 461 gguf_free(ctx_out); 462 fout.close(); 463 exit(EXIT_FAILURE); 464 } 465 466 // Do not trigger merge if we try to merge again the output 467 gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); 468 469 // Set metadata from the first split 470 gguf_set_kv(ctx_out, ctx_gguf); 471 } 472 473 auto n_tensors = gguf_get_n_tensors(ctx_gguf); 474 for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { 475 const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); 476 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); 477 gguf_add_tensor(ctx_out, t); 478 } 479 total_tensors += n_tensors; 480 481 fprintf(stderr, "\033[3Ddone\n"); 482 } 483 484 // placeholder for the meta data 485 { 486 auto meta_size = gguf_get_meta_size(ctx_out); 487 
::zeros(fout, meta_size); 488 } 489 490 // Write tensors data 491 for (int i_split = 0; i_split < n_split; i_split++) { 492 llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); 493 std::ifstream f_input(split_path, std::ios::binary); 494 if (!f_input.is_open()) { 495 fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); 496 for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { 497 gguf_free(ctx_ggufs[i]); 498 ggml_free(ctx_metas[i]); 499 } 500 gguf_free(ctx_out); 501 fout.close(); 502 exit(EXIT_FAILURE); 503 } 504 fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); 505 506 auto * ctx_gguf = ctx_ggufs[i_split]; 507 auto * ctx_meta = ctx_metas[i_split]; 508 509 auto n_tensors = gguf_get_n_tensors(ctx_gguf); 510 for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { 511 const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); 512 struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); 513 514 auto n_bytes = ggml_nbytes(t); 515 516 if (read_data.size() < n_bytes) { 517 read_data.resize(n_bytes); 518 } 519 520 auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); 521 f_input.seekg(offset); 522 f_input.read((char *)read_data.data(), n_bytes); 523 524 // write tensor data + padding 525 fout.write((const char *)read_data.data(), n_bytes); 526 zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); 527 } 528 529 gguf_free(ctx_gguf); 530 ggml_free(ctx_meta); 531 f_input.close(); 532 fprintf(stderr, "\033[3Ddone\n"); 533 } 534 535 { 536 // go back to beginning of file and write the updated metadata 537 fout.seekp(0); 538 std::vector<uint8_t> data(gguf_get_meta_size(ctx_out)); 539 gguf_get_meta_data(ctx_out, data.data()); 540 fout.write((const char *)data.data(), data.size()); 541 542 fout.close(); 543 gguf_free(ctx_out); 544 } 545 546 fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", 547 __func__, split_params.output.c_str(), 
n_split, total_tensors); 548 } 549 550 int main(int argc, const char ** argv) { 551 split_params params; 552 split_params_parse(argc, argv, params); 553 554 switch (params.operation) { 555 case SPLIT_OP_SPLIT: gguf_split(params); 556 break; 557 case SPLIT_OP_MERGE: gguf_merge(params); 558 break; 559 default: split_print_usage(argv[0]); 560 exit(EXIT_FAILURE); 561 } 562 563 return 0; 564 }