eval-callback.cpp
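// Demonstrates the ggml_backend_sched_eval_callback mechanism: a callback is
// registered via gpt_params and invoked for every graph node while a prompt is
// decoded, printing each tensor's name, type, op, sources, and shape, plus the
// tensor contents for non-quantized types. A typical invocation, with the
// model path and prompt as placeholders, might look like:
//
//   ./eval-callback -m model.gguf -p "Hello world"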
#include "common.h"
#include "llama.h"
#include "ggml.h"

#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>

/**
 * This is the arbitrary data which will be passed to each callback.
 * Later on we can for example add an operation or tensor name filter from a CLI arg, or a file descriptor to dump the tensor.
 */
struct callback_data {
    std::vector<uint8_t> data;
};

static std::string ggml_ne_string(const ggml_tensor * t) {
    std::string str;
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        str += std::to_string(t->ne[i]);
        if (i + 1 < GGML_MAX_DIMS) {
            str += ", ";
        }
    }
    return str;
}

// Pretty-print a tensor, showing at most the first and last n entries of each
// dimension and eliding the middle. ne holds the element counts per dimension,
// nb the strides in bytes.
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0; // note: only the printed (non-elided) elements are summed
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        printf("  [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
                printf("   ..., \n");
                i2 = ne[2] - n;
            }
            printf("   [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
                    printf("    ..., \n");
                    i1 = ne[1] - n;
                }
                printf("    [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    // nb are byte strides, so i is a byte offset into data
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                    } else if (type == GGML_TYPE_F32) {
                        v = *(float *) &data[i];
                    } else if (type == GGML_TYPE_I32) {
                        v = (float) *(int32_t *) &data[i];
                    } else if (type == GGML_TYPE_I16) {
                        v = (float) *(int16_t *) &data[i];
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) &data[i];
                    } else {
                        GGML_ASSERT(false);
                    }
                    printf("%12.4f", v);
                    sum += v;
                    if (i0 < ne[0] - 1) printf(", ");
                }
                printf("],\n");
            }
            printf("   ],\n");
        }
        printf("  ]\n");
        printf("  sum = %f\n", sum);
    }
}

/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor;
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data passed to each callback
 * @return true to receive data or continue the graph, false otherwise
 */
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // always retrieve data
    }

    char src1_str[128] = {0};
    if (src1) {
        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    printf("%s: %24s = (%s) %10s(%s{%s}, %s) = {%s}\n", __func__,
           t->name, ggml_type_name(t->type), ggml_op_desc(t),
           src0->name, ggml_ne_string(src0).c_str(),
           src1 ? src1_str : "",
           ggml_ne_string(t).c_str());

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type)) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}

static bool run(llama_context * ctx, const gpt_params & params) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}

int main(int argc, char ** argv) {
    callback_data cb_data;

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    print_build_info();

    std::mt19937 rng(params.seed);

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    // init
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
    if (!OK) {
        return 1;
    }

    llama_print_timings(ctx);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}
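
// The callback_data comment at the top mentions adding an operation or tensor
// name filter from a CLI arg. A minimal sketch of such a filter, assuming the
// filter strings would come from hypothetical CLI arguments (it is not wired
// into ggml_debug above). A check like this belongs in the ask == true phase:
// returning false there skips data retrieval for that tensor, whereas
// returning false in the ask == false phase cancels the whole graph compute.
static bool tensor_matches_filter(const struct ggml_tensor * t,
                                  const std::string & op_filter,
                                  const std::string & name_filter) {
    // an empty filter matches everything
    if (!op_filter.empty() && op_filter != ggml_op_desc(t)) {
        return false;
    }
    if (!name_filter.empty() && std::string(t->name).find(name_filter) == std::string::npos) {
        return false;
    }
    return true;
}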
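
// For code that does not go through gpt_params, the same callback can be
// attached directly to llama_context_params, which is what
// llama_init_from_gpt_params does under the hood. A minimal sketch, assuming
// the model has already been loaded elsewhere:
static llama_context * make_debug_context(llama_model * model, callback_data * cb_data) {
    llama_context_params cparams = llama_context_default_params();
    cparams.cb_eval           = ggml_debug; // same two-phase callback as above
    cparams.cb_eval_user_data = cb_data;
    return llama_new_context_with_model(model, cparams);
}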