/ tests / test-quantize-perf.cpp
test-quantize-perf.cpp
  1  // Benchmark quantization specific functions on synthetic data
  2  
#include "ggml.h"

#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <exception>
#include <functional>
#include <inttypes.h>
#include <math.h>
#include <memory>
#include <stdio.h>
#include <string>
#include <vector>
 15  
 16  #if defined(_MSC_VER)
 17  #pragma warning(disable: 4244 4267) // possible loss of data
 18  #endif
 19  
 20  #define MAX_ALIGNMENT 64
 21  #define QK 32
 22  #define WARMUP 5
 23  #define ITERATIONS 10
 24  #define MAX_ITERATIONS 100000000
 25  
 26  #define L1_SIZE      32*128
 27  #define L2_SIZE     32*2048
 28  #define L3_SIZE    32*20480
 29  #define MEM_SIZE 32*2048000
 30  
 31  struct quantize_perf_params {
 32      std::vector<std::string> include_types;
 33      std::vector<size_t> test_sizes;
 34      size_t alignment_offset = 0;
 35      bool op_quantize_row_q_reference = false;
 36      bool op_quantize_row_q = false;
 37      bool op_dequantize_row_q = false;
 38      bool op_quantize_row_q_dot = false;
 39      bool op_vec_dot_q = false;
 40      int64_t iterations = ITERATIONS;
 41  };
 42  
 43  #if defined(__x86_64__) || defined(__i386__)
 44  
 45  #include <x86intrin.h>
 46  inline int64_t cpu_cycles() {
 47  // Rough way to detect new-ish CPUs
 48  #ifdef __POPCNT__
 49      unsigned int dummy;
 50      return __rdtscp(&dummy);
 51  #else
 52      return __rdtsc();
 53  #endif
 54  }
 55  
 56  #else
 57  
 58  #define cpu_cycles() 0
 59  
 60  #endif
 61  
 62  
 63  // Generate synthetic data
 64  static void generate_data(float offset, size_t n, float * dst) {
 65      for (size_t i = 0; i < n; i++) {
 66          dst[i] = 0.1 + 2*cosf(i + offset);
 67      }
 68  }
 69  
 70  static float gigabytes_per_second(size_t bytes, int64_t usecs) {
 71      return bytes / (float) usecs * 1000000 / (1024*1024*1024);
 72  }
 73  
 74  static void * align_with_offset(void * ptr, int offset) {
 75      size_t dummy_size = MAX_ALIGNMENT * 4;
 76      return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
 77  }
 78  
 79  static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
 80      int64_t min_time_us = INT64_MAX;
 81      int64_t total_time_us = 0;
 82      int64_t min_time_cycles = INT64_MAX;
 83      int64_t total_time_cycles = 0;
 84  
 85      for (int i = 0; i < WARMUP; i++) {
 86          func();
 87      }
 88  
 89      for (int i = 0; i < iterations; i++) {
 90          const int64_t start_time = ggml_time_us();
 91          const int64_t start_cycles = cpu_cycles();
 92  
 93          func();
 94  
 95          const int64_t end_cycles = cpu_cycles();
 96          const int64_t end_time = ggml_time_us();
 97  
 98          total_time_cycles += end_cycles - start_cycles;
 99          min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
100          total_time_us += end_time - start_time;
101          min_time_us = std::min(min_time_us, end_time - start_time);
102      }
103  
104      printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
105      printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * iterations));
106      printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * iterations, total_time_us));
107      printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, total_time_us));
108  }
109  
110  static void usage(char * argv[]) {
111      printf("Benchmark quantization specific functions on synthetic data\n");
112      printf("\n");
113      printf("usage: %s [options]\n", argv[0]);
114      printf("\n");
115      printf("options: (default)\n");
116      printf("  -h, --help            show this help message and exit\n");
117      printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
118      printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
119      printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
120      printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
121      printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
122      printf("  --type TYPE           set test type as");
123      for (int i = 0; i < GGML_TYPE_COUNT; i++) {
124          ggml_type type = (ggml_type) i;
125          ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
126          if (ggml_type_name(type) != NULL) {
127              if (qfns.from_float && qfns.to_float) {
128                  printf(" %s", ggml_type_name(type));
129              }
130          }
131      }
132      printf(" (all)\n");
133      printf("  --alignment-offset OFFSET\n");
134      printf("                        set alignment offset as OFFSET (0)\n");
135      printf("  -i NUM, --iterations NUM\n");
136      printf("                        set test iteration number (%d)\n", ITERATIONS);
137  }
138  
139  int main(int argc, char * argv[]) {
140      quantize_perf_params params {};
141  
142      // read command line
143  
144      bool invalid_param = false;
145      std::string arg;
146      for (int i = 1; i < argc; i++) {
147          arg = argv[i];
148  
149          if (arg == "--size") {
150              if (++i >= argc) {
151                  invalid_param = true;
152                  break;
153              }
154              size_t size = std::stoi(argv[i]);
155              if (size % 32 != 0) {
156                  fprintf(stderr, "error: size %zu not divisible by 32\n", size);
157                  invalid_param = true;
158                  break;
159              }
160              params.test_sizes.push_back(size);
161          } else if (arg == "-3") {
162              // quick select sizes that probably fit in CPU caches
163              params.test_sizes.push_back(L1_SIZE);
164              params.test_sizes.push_back(L2_SIZE);
165              params.test_sizes.push_back(L3_SIZE);
166          } else if (arg == "-4") {
167              // quick select cache sizes + memory
168              params.test_sizes.push_back(L1_SIZE);
169              params.test_sizes.push_back(L2_SIZE);
170              params.test_sizes.push_back(L3_SIZE);
171              params.test_sizes.push_back(MEM_SIZE);
172          } else if (arg == "--op") {
173              if (++i >= argc) {
174                  invalid_param = true;
175                  break;
176              }
177              std::string op {argv[i]};
178              if (op == "quantize_row_q_reference") {
179                  params.op_quantize_row_q_reference = true;
180              } else if (op == "quantize_row_q") {
181                  params.op_quantize_row_q = true;
182              } else if (op == "dequantize_row_q") {
183                  params.op_dequantize_row_q = true;
184              } else if (op == "quantize_row_q_dot") {
185                  params.op_quantize_row_q_dot = true;
186              } else if (op == "vec_dot_q") {
187                  params.op_vec_dot_q = true;
188              } else {
189                  invalid_param = true;
190                  break;
191              }
192          } else if (arg == "--type") {
193              if (++i >= argc) {
194                  invalid_param = true;
195                  break;
196              }
197              params.include_types.push_back(argv[i]);
198          } else if (arg == "--alignment-offset") {
199              if (++i >= argc) {
200                  invalid_param = true;
201                  break;
202              }
203              int alignment = std::stoi(argv[i]);
204              if (alignment < 0 || alignment > MAX_ALIGNMENT) {
205              fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
206                  invalid_param = true;
207                  break;
208              }
209              params.alignment_offset = alignment;
210          } else if ((arg == "-i") || (arg == "--iterations")) {
211              if (++i >= argc) {
212                  invalid_param = true;
213                  break;
214              }
215              int number = std::stoi(argv[i]);
216              if (number < 0 || number > MAX_ITERATIONS) {
217              fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
218                  invalid_param = true;
219                  break;
220              }
221              params.iterations = number;
222          } else if ((arg == "-h") || (arg == "--help")) {
223              usage(argv);
224              return 1;
225          } else {
226              fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
227              return 1;
228          }
229      }
230      if (invalid_param) {
231          fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
232          return 1;
233      }
234  
235      if (params.test_sizes.empty()) {
236          params.test_sizes.push_back(L1_SIZE);
237      }
238      if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
239          params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
240      }
241  
242      std::sort(params.test_sizes.begin(), params.test_sizes.end());
243      size_t largest = params.test_sizes.back();
244  
245      std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
246      std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
247      std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
248      std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
249      std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
250  
251      float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
252      float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
253      float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
254      float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
255      float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
256  
257      generate_data(0, largest, test_data1);
258      generate_data(1, largest, test_data2);
259  
260      int64_t iterations = params.iterations;
261  
262  
263      // Initialize GGML, ensures float conversion tables are initialized
264      struct ggml_init_params ggml_params = {
265          /* .mem_size   = */ 1*1024,
266          /* .mem_buffer = */ NULL,
267          /* .no_alloc   = */ true,
268      };
269      struct ggml_context * ctx = ggml_init(ggml_params);
270  
271      for (int i = 0; i < GGML_TYPE_COUNT; i++) {
272          ggml_type type = (ggml_type) i;
273          ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
274          if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
275              continue;
276          }
277  
278          if (qfns.from_float && qfns.to_float) {
279              printf("%s\n", ggml_type_name(type));
280  
281              ggml_quantize_init(type);
282  
283              if (params.op_quantize_row_q_reference) {
284                  printf("  quantize_row_q_reference\n");
285                  for (size_t size : params.test_sizes) {
286                      printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
287                      auto quantize_fn = [&](void) -> float {
288                          qfns.from_float_reference(test_data1, test_q1, size);
289                          return test_q1[0];
290                      };
291                      size_t quantized_size = ggml_row_size(type, size);
292                      benchmark_function(size, quantized_size, iterations, quantize_fn);
293                  }
294                  printf("\n");
295              }
296  
297              if (params.op_quantize_row_q) {
298                  printf("  quantize_row_q\n");
299                  for (size_t size : params.test_sizes) {
300                      printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
301                      auto quantize_fn = [&](void) -> float {
302                          qfns.from_float(test_data1, test_q1, size);
303                          return test_q1[0];
304                      };
305                      size_t quantized_size = ggml_row_size(type, size);
306                      benchmark_function(size, quantized_size, iterations, quantize_fn);
307                  }
308                  printf("\n");
309              }
310  
311              if (params.op_dequantize_row_q) {
312                  printf("  dequantize_row_q\n");
313                  qfns.from_float(test_data1, test_q1, largest);
314                  for (size_t size : params.test_sizes) {
315                      printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
316                      auto quantize_fn = [&](void) -> float {
317                          qfns.to_float(test_q1, test_out, size);
318                          return test_out[0];
319                      };
320                      size_t quantized_size = ggml_row_size(type, size);
321                      benchmark_function(size, quantized_size, iterations, quantize_fn);
322                  }
323                  printf("\n");
324              }
325  
326              if (params.op_quantize_row_q_dot) {
327                  printf("  quantize_row_q_dot\n");
328                  for (size_t size : params.test_sizes) {
329                      printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
330                      auto quantize_fn = [&](void) -> float {
331                          auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
332                          vdot.from_float(test_data1, test_q1, size);
333                          return test_q1[0];
334                      };
335                      size_t quantized_size = ggml_row_size(type, size);
336                      benchmark_function(size, quantized_size, iterations, quantize_fn);
337                  }
338                  printf("\n");
339              }
340  
341              if (params.op_vec_dot_q) {
342                  printf("  vec_dot_q\n");
343                  qfns.from_float(test_data1, test_q1, largest);
344                  qfns.from_float(test_data2, test_q2, largest);
345                  for (size_t size : params.test_sizes) {
346                      printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
347                      auto quantize_fn = [&](void) -> float {
348                          float result;
349                          qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
350                          return result;
351                      };
352                      size_t quantized_size = ggml_row_size(type, size);
353                      benchmark_function(size, quantized_size, iterations, quantize_fn);
354                  }
355                  printf("\n");
356              }
357          }
358      }
359  
360      ggml_free(ctx);
361  
362      return 0;
363  }