batched-bench.cpp
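Benchmarks the batched decoding performance of llama.cpp for different combinations of prompt length, generation length, and number of parallel sequences.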
1 #include "common.h" 2 #include "llama.h" 3 4 #include <algorithm> 5 #include <cmath> 6 #include <cstdio> 7 #include <string> 8 #include <vector> 9 10 // mutates the input string 11 static std::vector<int> parse_list(char * p) { 12 std::vector<int> ret; 13 14 char * q = p; 15 16 while (*p) { 17 if (*p == ',') { 18 *p = '\0'; 19 ret.push_back(std::atoi(q)); 20 q = p + 1; 21 } 22 23 ++p; 24 } 25 26 ret.push_back(std::atoi(q)); 27 28 return ret; 29 } 30 31 static void print_usage(int argc, char ** argv, const gpt_params & params) { 32 gpt_params_print_usage(argc, argv, params); 33 34 LOG_TEE("\nexample usage:\n"); 35 LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); 36 LOG_TEE("\n"); 37 } 38 39 int main(int argc, char ** argv) { 40 gpt_params params; 41 42 if (!gpt_params_parse(argc, argv, params)) { 43 print_usage(argc, argv, params); 44 return 1; 45 } 46 47 int is_pp_shared = params.is_pp_shared; 48 49 std::vector<int> n_pp = params.n_pp; 50 std::vector<int> n_tg = params.n_tg; 51 std::vector<int> n_pl = params.n_pl; 52 53 // init LLM 54 55 llama_backend_init(); 56 llama_numa_init(params.numa); 57 58 // initialize the model 59 60 llama_model_params model_params = llama_model_params_from_gpt_params(params); 61 62 llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); 63 64 if (model == NULL) { 65 fprintf(stderr , "%s: error: unable to load model\n" , __func__); 66 return 1; 67 } 68 69 llama_context_params ctx_params = llama_context_params_from_gpt_params(params); 70 71 // ensure enough sequences are available 72 ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); 73 74 llama_context * ctx = llama_new_context_with_model(model, ctx_params); 75 76 if (ctx == NULL) { 77 fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); 78 return 1; 79 } 80 81 const int32_t n_kv_max = llama_n_ctx(ctx); 82 83 llama_batch batch = llama_batch_init(n_kv_max, 0, 1); 84 85 // decode in batches of ctx_params.n_batch tokens 86 auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { 87 for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { 88 const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); 89 90 llama_batch batch_view = { 91 n_tokens, 92 batch.token + i, 93 nullptr, 94 batch.pos + i, 95 batch.n_seq_id + i, 96 batch.seq_id + i, 97 batch.logits + i, 98 0, 0, 0, // unused 99 }; 100 101 const int ret = llama_decode(ctx, batch_view); 102 if (ret != 0) { 103 LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); 104 return false; 105 } 106 107 llama_synchronize(ctx); 108 } 109 110 return true; 111 }; 112 113 // warm up 114 { 115 for (int i = 0; i < 16; ++i) { 116 llama_batch_add(batch, 0, i, { 0 }, false); 117 } 118 119 if (!decode_helper(ctx, batch, ctx_params.n_batch)) { 120 LOG_TEE("%s: llama_decode() failed\n", __func__); 121 return 1; 122 } 123 } 124 125 LOG_TEE("\n"); 126 LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); 127 LOG_TEE("\n"); 128 129 LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); 130 
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); 131 132 for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { 133 for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { 134 for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) { 135 const int pp = n_pp[i_pp]; 136 const int tg = n_tg[i_tg]; 137 const int pl = n_pl[i_pl]; 138 139 const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg); 140 141 if (n_ctx_req > n_kv_max) { 142 continue; 143 } 144 145 llama_batch_clear(batch); 146 147 for (int i = 0; i < pp; ++i) { 148 for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { 149 llama_batch_add(batch, 0, i, { j }, false); 150 } 151 } 152 batch.logits[batch.n_tokens - 1] = true; 153 154 const auto t_pp_start = ggml_time_us(); 155 156 llama_kv_cache_clear(ctx); 157 158 if (!decode_helper(ctx, batch, ctx_params.n_batch)) { 159 LOG_TEE("%s: llama_decode() failed\n", __func__); 160 return 1; 161 } 162 163 if (is_pp_shared) { 164 for (int32_t i = 1; i < pl; ++i) { 165 llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); 166 } 167 } 168 169 const auto t_pp_end = ggml_time_us(); 170 171 const auto t_tg_start = ggml_time_us(); 172 173 for (int i = 0; i < tg; ++i) { 174 llama_batch_clear(batch); 175 176 for (int j = 0; j < pl; ++j) { 177 llama_batch_add(batch, 0, pp + i, { j }, true); 178 } 179 180 if (!decode_helper(ctx, batch, ctx_params.n_batch)) { 181 LOG_TEE("%s: llama_decode() failed\n", __func__); 182 return 1; 183 } 184 } 185 186 const auto t_tg_end = ggml_time_us(); 187 188 const int32_t n_kv = n_ctx_req; 189 190 const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f; 191 const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f; 192 const float t = t_pp + t_tg; 193 194 const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp; 195 const float speed_tg = pl*tg / t_tg; 196 const float speed = n_kv / t; 197 198 LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); 199 } 200 } 201 } 202 203 llama_print_timings(ctx); 204 205 llama_batch_free(batch); 206 207 llama_free(ctx); 208 llama_free_model(model); 209 210 llama_backend_free(); 211 212 fprintf(stderr, "\n\n"); 213 214 return 0; 215 }