benchmark-matmult.cpp
1 #include "common.h" 2 #include "ggml.h" 3 4 #include <locale.h> 5 #include <assert.h> 6 #include <math.h> 7 #include <cstring> 8 #include <cstdio> 9 #include <cinttypes> 10 #include <unordered_map> 11 #include <queue> 12 #include <string.h> 13 #include <cassert> 14 #include <fstream> 15 #include <string> 16 #include <iterator> 17 #include <algorithm> 18 19 #if defined(_MSC_VER) 20 #pragma warning(disable: 4244 4267) // possible loss of data 21 #endif 22 23 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) { 24 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); 25 26 if (plan.work_size > 0) { 27 buf.resize(plan.work_size); 28 plan.work_data = buf.data(); 29 } 30 31 ggml_graph_compute(graph, &plan); 32 } 33 34 static float tensor_sum_elements(const ggml_tensor * tensor) { 35 double sum = 0; 36 if (tensor->type == GGML_TYPE_F32) { 37 for (int j = 0; j < tensor->ne[1]; j++) { 38 for (int k = 0; k < tensor->ne[0]; k++) { 39 sum += ((float *) tensor->data)[j*tensor->ne[0] + k]; 40 } 41 } 42 } 43 return sum; 44 } 45 46 static void tensor_dump(const ggml_tensor * tensor, const char * name) { 47 printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name, 48 tensor->type, ggml_type_name(tensor->type), 49 tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); 50 float sum = tensor_sum_elements(tensor); 51 printf("Sum of tensor %s is %6.2f\n", name, sum); 52 } 53 54 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) 55 56 struct benchmark_params_struct { 57 int32_t n_threads = 1; 58 int32_t n_iterations = 10; 59 }; 60 61 static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) { 62 fprintf(stderr, "usage: %s [options]\n", argv[0]); 63 fprintf(stderr, "\n"); 64 fprintf(stderr, "options:\n"); 65 fprintf(stderr, " -h, --help show this help message and exit\n"); 66 fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); 67 fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations); 68 fprintf(stderr, "\n"); 69 } 70 71 int main(int argc, char ** argv) { 72 struct benchmark_params_struct benchmark_params; 73 74 bool invalid_param = false; 75 std::string arg; 76 for (int i = 1; i < argc; i++) { 77 arg = argv[i]; 78 79 if (arg == "-t" || arg == "--threads") { 80 if (++i >= argc) { 81 invalid_param = true; 82 break; 83 } 84 benchmark_params.n_threads = std::stoi(argv[i]); 85 } else if (arg == "-i" || arg == "--iter") { 86 if (++i >= argc) { 87 invalid_param = true; 88 break; 89 } 90 benchmark_params.n_iterations = std::stoi(argv[i]); 91 } else if (arg == "-h" || arg == "--help") { 92 print_usage(argc, argv, benchmark_params); 93 exit(0); 94 } 95 } 96 if (invalid_param) { 97 fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); 98 print_usage(argc, argv, benchmark_params); 99 exit(1); 100 } 101 102 print_build_info(); 103 printf("Starting Test\n"); 104 105 // create the ggml context 106 struct ggml_context * ctx; 107 //const int sizex = 4096; 108 //const int sizey = 11008; 109 110 #undef VERBOSE_DEBUGGING 111 #ifndef VERBOSE_DEBUGGING 112 const int sizey = 4096; 113 const int sizex = 11008; 114 const int sizez = 128; 115 #else 116 /* Working - let's increase size */ 117 const int sizey = 1; 118 const int sizex = (8*32); 119 const int sizez = 1; 120 121 /*const int sizey = 1; 122 const int sizex = 3*(8*32); 123 const int sizez = 1;*/ 124 #endif 125 126 //printf("Memsize required = %i\n", sizex*sizex); 127 128 // TODO: perform the bench for all types or for a user specified type 129 const ggml_type qtype = GGML_TYPE_Q4_1; 130 131 size_t ctx_size = 0; 132 ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); 133 ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); 134 ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); 135 ctx_size += ggml_row_size(qtype, sizex*sizey); 136 ctx_size += ggml_row_size(qtype, sizex*sizey); 137 ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS 138 ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS 139 ctx_size += 1024*1024*16; 140 141 printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); 142 143 struct ggml_init_params params = { 144 /*.mem_size =*/ ctx_size, 145 /*.mem_buffer =*/ NULL, 146 /* no_alloc =*/ 0 147 }; 148 149 ctx = ggml_init(params); 150 if (!ctx) { 151 fprintf(stderr, "%s: ggml_init() failed\n", __func__); 152 return 1; 153 } 154 155 156 printf("Creating new tensors\n"); 157 // printf("Creating new tensor m1\n"); 158 struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); 159 ggml_set_f32(m11, 1.0f); 160 161 // printf("Creating new tensor m1\n"); 162 struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey); 163 ggml_set_f32(m12, 1.5f); 164 165 // printf("Creating new tensor m2\n"); 166 struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez); 167 ggml_set_f32(m2, 2.0f); 168 169 printf("\n------ Test 1 - Matrix Mult via F32 code\n"); 170 // printf("Creating new tensor m11xm2\n"); 171 struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2); 172 173 // printf("Creating compute graph\n"); 174 struct ggml_cgraph * gf = ggml_new_graph(ctx); 175 ggml_build_forward_expand(gf, m11xm2); 176 177 printf("n_threads=%i\n", benchmark_params.n_threads); 178 179 TENSOR_DUMP(m11); 180 TENSOR_DUMP(m2); 181 182 std::vector<uint8_t> work_buffer; 183 184 ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); 185 186 TENSOR_DUMP(gf->nodes[0]); 187 188 printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); 189 190 int32_t nelements = sizex*sizey; 191 192 // Set up a the benchmark matrices 193 // printf("Creating new tensor q11 & Running quantize\n"); 194 struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); 195 ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); 196 197 // Set up a the compute graph 198 // printf("Creating new tensor q31\n"); 199 struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2); 200 201 // printf("Creating compute graph\n"); 202 struct ggml_cgraph * gf31 = ggml_new_graph(ctx); 203 ggml_build_forward_expand(gf31, q31); 204 205 // Set up a second graph computation to make sure we override the CPU cache lines 206 // printf("Creating new tensor q12 & Running quantize\n"); 207 struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); 208 ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); 209 210 // printf("Creating new tensor q32\n"); 211 struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); 212 213 //printf("Creating compute graph\n"); 214 struct ggml_cgraph * gf32 = ggml_new_graph(ctx); 215 ggml_build_forward_expand(gf32, q32); 216 printf("n_threads=%i\n", benchmark_params.n_threads); 217 218 const int dimx = sizex; 219 const int dimy = sizey; 220 const int dimz = sizez; 221 long long int flops_per_dot_product = dimy + dimy; 222 long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; 223 printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); 224 225 226 // Let's use the F32 result from above as a reference for the quantized multiplication 227 float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); 228 229 printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); 230 printf("=====================================================================================\n"); 231 232 double gflops_sum = 0; 233 for (int i=0;i<benchmark_params.n_iterations ;i++) { 234 235 long long int start = ggml_time_us(); 236 //printf("Running ggml_graph_compute\n"); 237 ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); 238 239 long long int stop = ggml_time_us(); 240 long long int usec = stop-start; 241 double gflops = (double)(flops_per_matrix)/usec/1000.0; 242 gflops_sum += gflops; 243 printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", 244 i, 245 benchmark_params.n_threads, 246 sizex, sizey, sizez, flops_per_matrix, 247 usec,gflops); 248 249 #ifdef VERBOSE_DEBUGGING 250 TENSOR_DUMP("res",gf31.nodes[0]) 251 #endif 252 253 // Check that the matrix multiplication result is in the right ballpark 254 // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different 255 float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]); 256 float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); 257 float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 258 259 if (delta > allowed_delta) { 260 printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n", 261 sum_of_F32_reference, 262 sum_of_Q4_result, 263 delta, 264 allowed_delta 265 ); 266 exit(0); 267 } 268 269 // Running a different graph computation to make sure we override the CPU cache lines 270 ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); 271 } 272 printf("\n"); 273 printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); 274 printf("=====================================================================================\n"); 275 }