// examples/benchmark/benchmark-matmult.cpp
  1  #include "common.h"
  2  #include "ggml.h"
  3  
  4  #include <locale.h>
  5  #include <assert.h>
  6  #include <math.h>
  7  #include <cstring>
  8  #include <cstdio>
  9  #include <cinttypes>
 10  #include <unordered_map>
 11  #include <queue>
 12  #include <string.h>
 13  #include <cassert>
 14  #include <fstream>
 15  #include <string>
 16  #include <iterator>
 17  #include <algorithm>
 18  
 19  #if defined(_MSC_VER)
 20  #pragma warning(disable: 4244 4267) // possible loss of data
 21  #endif
 22  
 23  static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 24      struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 25  
 26      if (plan.work_size > 0) {
 27          buf.resize(plan.work_size);
 28          plan.work_data = buf.data();
 29      }
 30  
 31      ggml_graph_compute(graph, &plan);
 32  }
 33  
 34  static float tensor_sum_elements(const ggml_tensor * tensor) {
 35      double sum = 0;
 36      if (tensor->type == GGML_TYPE_F32) {
 37          for (int j = 0; j < tensor->ne[1]; j++) {
 38              for (int k = 0; k < tensor->ne[0]; k++) {
 39                  sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
 40              }
 41          }
 42      }
 43      return sum;
 44  }
 45  
 46  static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 47      printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
 48          tensor->type, ggml_type_name(tensor->type),
 49          tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
 50      float sum = tensor_sum_elements(tensor);
 51      printf("Sum of tensor %s is %6.2f\n", name, sum);
 52  }
 53  
 54  #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 55  
// Benchmark settings configurable from the command line (see print_usage).
struct benchmark_params_struct {
    int32_t n_threads     = 1;  // -t/--threads: threads used per graph compute
    int32_t n_iterations  = 10; // -i/--iter: number of timed benchmark iterations
};
 60  
 61  static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
 62      fprintf(stderr, "usage: %s [options]\n", argv[0]);
 63      fprintf(stderr, "\n");
 64      fprintf(stderr, "options:\n");
 65      fprintf(stderr, "  -h, --help            show this help message and exit\n");
 66      fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
 67      fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
 68      fprintf(stderr, "\n");
 69  }
 70  
 71  int main(int argc, char ** argv)  {
 72      struct benchmark_params_struct benchmark_params;
 73  
 74      bool invalid_param = false;
 75      std::string arg;
 76      for (int i = 1; i < argc; i++) {
 77          arg = argv[i];
 78  
 79          if (arg == "-t" || arg == "--threads") {
 80              if (++i >= argc) {
 81                  invalid_param = true;
 82                  break;
 83              }
 84              benchmark_params.n_threads = std::stoi(argv[i]);
 85          } else if (arg == "-i" || arg == "--iter") {
 86              if (++i >= argc) {
 87                  invalid_param = true;
 88                  break;
 89              }
 90              benchmark_params.n_iterations = std::stoi(argv[i]);
 91          }  else if (arg == "-h" || arg == "--help") {
 92              print_usage(argc, argv, benchmark_params);
 93              exit(0);
 94          }
 95      }
 96      if (invalid_param) {
 97          fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
 98          print_usage(argc, argv, benchmark_params);
 99          exit(1);
100      }
101  
102      print_build_info();
103      printf("Starting Test\n");
104  
105      // create the ggml context
106      struct ggml_context * ctx;
107      //const int sizex = 4096;
108      //const int sizey = 11008;
109  
110  #undef VERBOSE_DEBUGGING
111  #ifndef VERBOSE_DEBUGGING
112      const int sizey = 4096;
113      const int sizex = 11008;
114      const int sizez = 128;
115  #else
116      /* Working - let's increase size */
117      const int sizey = 1;
118      const int sizex = (8*32);
119      const int sizez = 1;
120  
121      /*const int sizey = 1;
122      const int sizex = 3*(8*32);
123      const int sizez = 1;*/
124  #endif
125  
126      //printf("Memsize required = %i\n", sizex*sizex);
127  
128      // TODO: perform the bench for all types or for a user specified type
129      const ggml_type qtype = GGML_TYPE_Q4_1;
130  
131      size_t ctx_size = 0;
132      ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
133      ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
134      ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
135      ctx_size += ggml_row_size(qtype,         sizex*sizey);
136      ctx_size += ggml_row_size(qtype,         sizex*sizey);
137      ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
138      ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
139      ctx_size += 1024*1024*16;
140  
141      printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
142  
143      struct ggml_init_params params = {
144          /*.mem_size   =*/ ctx_size,
145          /*.mem_buffer =*/ NULL,
146          /* no_alloc   =*/ 0
147      };
148  
149      ctx = ggml_init(params);
150      if (!ctx) {
151          fprintf(stderr, "%s: ggml_init() failed\n", __func__);
152          return 1;
153      }
154  
155  
156      printf("Creating new tensors\n");
157      // printf("Creating new tensor m1\n");
158      struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
159      ggml_set_f32(m11, 1.0f);
160  
161      // printf("Creating new tensor m1\n");
162      struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
163      ggml_set_f32(m12, 1.5f);
164  
165      // printf("Creating new tensor m2\n");
166      struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
167      ggml_set_f32(m2, 2.0f);
168  
169      printf("\n------ Test 1 - Matrix Mult via F32 code\n");
170      // printf("Creating new tensor m11xm2\n");
171      struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
172  
173      // printf("Creating compute graph\n");
174      struct ggml_cgraph * gf = ggml_new_graph(ctx);
175      ggml_build_forward_expand(gf, m11xm2);
176  
177      printf("n_threads=%i\n", benchmark_params.n_threads);
178  
179      TENSOR_DUMP(m11);
180      TENSOR_DUMP(m2);
181  
182      std::vector<uint8_t> work_buffer;
183  
184      ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
185  
186      TENSOR_DUMP(gf->nodes[0]);
187  
188      printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
189  
190      int32_t nelements = sizex*sizey;
191  
192      // Set up a the benchmark matrices
193      // printf("Creating new tensor q11 & Running quantize\n");
194      struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
195      ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
196  
197      // Set up a the compute graph
198      // printf("Creating new tensor q31\n");
199      struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
200  
201      // printf("Creating compute graph\n");
202      struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
203      ggml_build_forward_expand(gf31, q31);
204  
205      // Set up a second graph computation to make sure we override the CPU cache lines
206      // printf("Creating new tensor q12 & Running quantize\n");
207      struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
208      ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
209  
210      // printf("Creating new tensor q32\n");
211      struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
212  
213      //printf("Creating compute graph\n");
214      struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
215      ggml_build_forward_expand(gf32, q32);
216      printf("n_threads=%i\n", benchmark_params.n_threads);
217  
218      const int dimx = sizex;
219      const int dimy = sizey;
220      const int dimz = sizez;
221      long long int flops_per_dot_product = dimy + dimy;
222      long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
223      printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
224  
225  
226      // Let's use the F32 result from above as a reference for the quantized multiplication
227      float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
228  
229      printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
230      printf("=====================================================================================\n");
231  
232      double  gflops_sum = 0;
233      for (int i=0;i<benchmark_params.n_iterations ;i++) {
234  
235          long long int start = ggml_time_us();
236          //printf("Running ggml_graph_compute\n");
237          ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
238  
239          long long int stop = ggml_time_us();
240          long long int usec = stop-start;
241          double gflops = (double)(flops_per_matrix)/usec/1000.0;
242          gflops_sum += gflops;
243          printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
244              i,
245              benchmark_params.n_threads,
246              sizex, sizey, sizez, flops_per_matrix,
247              usec,gflops);
248  
249  #ifdef VERBOSE_DEBUGGING
250          TENSOR_DUMP("res",gf31.nodes[0])
251  #endif
252  
253          // Check that the matrix multiplication result is in the right ballpark
254          // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
255          float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
256          float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
257          float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
258  
259          if (delta > allowed_delta)  {
260              printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
261                  sum_of_F32_reference,
262                  sum_of_Q4_result,
263                  delta,
264                  allowed_delta
265              );
266              exit(0);
267          }
268  
269          // Running a different graph computation to make sure we override the CPU cache lines
270          ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
271      }
272      printf("\n");
273      printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
274      printf("=====================================================================================\n");
275  }