test-quantize-perf.cpp
1 // Benchmark quantization specific functions on synthetic data 2 3 #include "ggml.h" 4 5 #undef NDEBUG 6 #include <algorithm> 7 #include <assert.h> 8 #include <functional> 9 #include <inttypes.h> 10 #include <math.h> 11 #include <memory> 12 #include <stdio.h> 13 #include <string> 14 #include <vector> 15 16 #if defined(_MSC_VER) 17 #pragma warning(disable: 4244 4267) // possible loss of data 18 #endif 19 20 #define MAX_ALIGNMENT 64 21 #define QK 32 22 #define WARMUP 5 23 #define ITERATIONS 10 24 #define MAX_ITERATIONS 100000000 25 26 #define L1_SIZE 32*128 27 #define L2_SIZE 32*2048 28 #define L3_SIZE 32*20480 29 #define MEM_SIZE 32*2048000 30 31 struct quantize_perf_params { 32 std::vector<std::string> include_types; 33 std::vector<size_t> test_sizes; 34 size_t alignment_offset = 0; 35 bool op_quantize_row_q_reference = false; 36 bool op_quantize_row_q = false; 37 bool op_dequantize_row_q = false; 38 bool op_quantize_row_q_dot = false; 39 bool op_vec_dot_q = false; 40 int64_t iterations = ITERATIONS; 41 }; 42 43 #if defined(__x86_64__) || defined(__i386__) 44 45 #include <x86intrin.h> 46 inline int64_t cpu_cycles() { 47 // Rough way to detect new-ish CPUs 48 #ifdef __POPCNT__ 49 unsigned int dummy; 50 return __rdtscp(&dummy); 51 #else 52 return __rdtsc(); 53 #endif 54 } 55 56 #else 57 58 #define cpu_cycles() 0 59 60 #endif 61 62 63 // Generate synthetic data 64 static void generate_data(float offset, size_t n, float * dst) { 65 for (size_t i = 0; i < n; i++) { 66 dst[i] = 0.1 + 2*cosf(i + offset); 67 } 68 } 69 70 static float gigabytes_per_second(size_t bytes, int64_t usecs) { 71 return bytes / (float) usecs * 1000000 / (1024*1024*1024); 72 } 73 74 static void * align_with_offset(void * ptr, int offset) { 75 size_t dummy_size = MAX_ALIGNMENT * 4; 76 return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; 77 } 78 79 static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) { 80 int64_t min_time_us = INT64_MAX; 81 int64_t total_time_us = 0; 82 int64_t min_time_cycles = INT64_MAX; 83 int64_t total_time_cycles = 0; 84 85 for (int i = 0; i < WARMUP; i++) { 86 func(); 87 } 88 89 for (int i = 0; i < iterations; i++) { 90 const int64_t start_time = ggml_time_us(); 91 const int64_t start_cycles = cpu_cycles(); 92 93 func(); 94 95 const int64_t end_cycles = cpu_cycles(); 96 const int64_t end_time = ggml_time_us(); 97 98 total_time_cycles += end_cycles - start_cycles; 99 min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles); 100 total_time_us += end_time - start_time; 101 min_time_us = std::min(min_time_us, end_time - start_time); 102 } 103 104 printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size); 105 printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations)); 106 printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us)); 107 printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); 108 } 109 110 static void usage(char * argv[]) { 111 printf("Benchmark quantization specific functions on synthetic data\n"); 112 printf("\n"); 113 printf("usage: %s [options]\n", argv[0]); 114 printf("\n"); 115 printf("options: (default)\n"); 116 printf(" -h, --help show this help message and exit\n"); 117 printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); 118 printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); 119 printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); 120 printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); 121 printf(" quantize_row_q_dot, vec_dot_q (all)\n"); 122 printf(" --type TYPE set test type as"); 123 for (int i = 0; i < GGML_TYPE_COUNT; i++) { 124 ggml_type type = (ggml_type) i; 125 ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); 126 if (ggml_type_name(type) != NULL) { 127 if (qfns.from_float && qfns.to_float) { 128 printf(" %s", ggml_type_name(type)); 129 } 130 } 131 } 132 printf(" (all)\n"); 133 printf(" --alignment-offset OFFSET\n"); 134 printf(" set alignment offset as OFFSET (0)\n"); 135 printf(" -i NUM, --iterations NUM\n"); 136 printf(" set test iteration number (%d)\n", ITERATIONS); 137 } 138 139 int main(int argc, char * argv[]) { 140 quantize_perf_params params {}; 141 142 // read command line 143 144 bool invalid_param = false; 145 std::string arg; 146 for (int i = 1; i < argc; i++) { 147 arg = argv[i]; 148 149 if (arg == "--size") { 150 if (++i >= argc) { 151 invalid_param = true; 152 break; 153 } 154 size_t size = std::stoi(argv[i]); 155 if (size % 32 != 0) { 156 fprintf(stderr, "error: size %zu not divisible by 32\n", size); 157 invalid_param = true; 158 break; 159 } 160 params.test_sizes.push_back(size); 161 } else if (arg == "-3") { 162 // quick select sizes that probably fit in CPU caches 163 params.test_sizes.push_back(L1_SIZE); 164 params.test_sizes.push_back(L2_SIZE); 165 params.test_sizes.push_back(L3_SIZE); 166 } else if (arg == "-4") { 167 // quick select cache sizes + memory 168 params.test_sizes.push_back(L1_SIZE); 169 params.test_sizes.push_back(L2_SIZE); 170 params.test_sizes.push_back(L3_SIZE); 171 params.test_sizes.push_back(MEM_SIZE); 172 } else if (arg == "--op") { 173 if (++i >= argc) { 174 invalid_param = true; 175 break; 176 } 177 std::string op {argv[i]}; 178 if (op == "quantize_row_q_reference") { 179 params.op_quantize_row_q_reference = true; 180 } else if (op == "quantize_row_q") { 181 params.op_quantize_row_q = true; 182 } else if (op == "dequantize_row_q") { 183 params.op_dequantize_row_q = true; 184 } else if (op == "quantize_row_q_dot") { 185 params.op_quantize_row_q_dot = true; 186 } else if (op == "vec_dot_q") { 187 params.op_vec_dot_q = true; 188 } else { 189 invalid_param = true; 190 break; 191 } 192 } else if (arg == "--type") { 193 if (++i >= argc) { 194 invalid_param = true; 195 break; 196 } 197 params.include_types.push_back(argv[i]); 198 } else if (arg == "--alignment-offset") { 199 if (++i >= argc) { 200 invalid_param = true; 201 break; 202 } 203 int alignment = std::stoi(argv[i]); 204 if (alignment < 0 || alignment > MAX_ALIGNMENT) { 205 fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT); 206 invalid_param = true; 207 break; 208 } 209 params.alignment_offset = alignment; 210 } else if ((arg == "-i") || (arg == "--iterations")) { 211 if (++i >= argc) { 212 invalid_param = true; 213 break; 214 } 215 int number = std::stoi(argv[i]); 216 if (number < 0 || number > MAX_ITERATIONS) { 217 fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS); 218 invalid_param = true; 219 break; 220 } 221 params.iterations = number; 222 } else if ((arg == "-h") || (arg == "--help")) { 223 usage(argv); 224 return 1; 225 } else { 226 fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 227 return 1; 228 } 229 } 230 if (invalid_param) { 231 fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); 232 return 1; 233 } 234 235 if (params.test_sizes.empty()) { 236 params.test_sizes.push_back(L1_SIZE); 237 } 238 if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) { 239 params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true; 240 } 241 242 std::sort(params.test_sizes.begin(), params.test_sizes.end()); 243 size_t largest = params.test_sizes.back(); 244 245 std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2); 246 std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2); 247 std::vector<uint8_t> test_q1_v (largest*4 + MAX_ALIGNMENT*2); 248 std::vector<uint8_t> test_q2_v (largest*4 + MAX_ALIGNMENT*2); 249 std::vector<uint8_t> test_out_v (largest*4 + MAX_ALIGNMENT*2); 250 251 float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset); 252 float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset); 253 float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset); 254 float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset); 255 float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset); 256 257 generate_data(0, largest, test_data1); 258 generate_data(1, largest, test_data2); 259 260 int64_t iterations = params.iterations; 261 262 263 // Initialize GGML, ensures float conversion tables are initialized 264 struct ggml_init_params ggml_params = { 265 /* .mem_size = */ 1*1024, 266 /* .mem_buffer = */ NULL, 267 /* .no_alloc = */ true, 268 }; 269 struct ggml_context * ctx = ggml_init(ggml_params); 270 271 for (int i = 0; i < GGML_TYPE_COUNT; i++) { 272 ggml_type type = (ggml_type) i; 273 ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); 274 if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { 275 continue; 276 } 277 278 if (qfns.from_float && qfns.to_float) { 279 printf("%s\n", ggml_type_name(type)); 280 281 ggml_quantize_init(type); 282 283 if (params.op_quantize_row_q_reference) { 284 printf(" quantize_row_q_reference\n"); 285 for (size_t size : params.test_sizes) { 286 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); 287 auto quantize_fn = [&](void) -> float { 288 qfns.from_float_reference(test_data1, test_q1, size); 289 return test_q1[0]; 290 }; 291 size_t quantized_size = ggml_row_size(type, size); 292 benchmark_function(size, quantized_size, iterations, quantize_fn); 293 } 294 printf("\n"); 295 } 296 297 if (params.op_quantize_row_q) { 298 printf(" quantize_row_q\n"); 299 for (size_t size : params.test_sizes) { 300 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); 301 auto quantize_fn = [&](void) -> float { 302 qfns.from_float(test_data1, test_q1, size); 303 return test_q1[0]; 304 }; 305 size_t quantized_size = ggml_row_size(type, size); 306 benchmark_function(size, quantized_size, iterations, quantize_fn); 307 } 308 printf("\n"); 309 } 310 311 if (params.op_dequantize_row_q) { 312 printf(" dequantize_row_q\n"); 313 qfns.from_float(test_data1, test_q1, largest); 314 for (size_t size : params.test_sizes) { 315 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); 316 auto quantize_fn = [&](void) -> float { 317 qfns.to_float(test_q1, test_out, size); 318 return test_out[0]; 319 }; 320 size_t quantized_size = ggml_row_size(type, size); 321 benchmark_function(size, quantized_size, iterations, quantize_fn); 322 } 323 printf("\n"); 324 } 325 326 if (params.op_quantize_row_q_dot) { 327 printf(" quantize_row_q_dot\n"); 328 for (size_t size : params.test_sizes) { 329 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); 330 auto quantize_fn = [&](void) -> float { 331 auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); 332 vdot.from_float(test_data1, test_q1, size); 333 return test_q1[0]; 334 }; 335 size_t quantized_size = ggml_row_size(type, size); 336 benchmark_function(size, quantized_size, iterations, quantize_fn); 337 } 338 printf("\n"); 339 } 340 341 if (params.op_vec_dot_q) { 342 printf(" vec_dot_q\n"); 343 qfns.from_float(test_data1, test_q1, largest); 344 qfns.from_float(test_data2, test_q2, largest); 345 for (size_t size : params.test_sizes) { 346 printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); 347 auto quantize_fn = [&](void) -> float { 348 float result; 349 qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); 350 return result; 351 }; 352 size_t quantized_size = ggml_row_size(type, size); 353 benchmark_function(size, quantized_size, iterations, quantize_fn); 354 } 355 printf("\n"); 356 } 357 } 358 } 359 360 ggml_free(ctx); 361 362 return 0; 363 }