test-opt.cpp
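// minimal check of ggml's built-in optimizer: two random matrices a and b are
// registered as parameters and tuned with Adam to minimize the scalar loss
// e = sum((c - mul_mat(a, b))^2) against a fixed random target c; the test
// passes if optimization did not increase the loss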
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>

#define MAX_NARGS 2

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif

//
// logging
//
#define GGML_DEBUG 0
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)


static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
}

// fill a new F32 tensor with uniform random values in [fmin, fmax]
static struct ggml_tensor * get_random_tensor(
        struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    switch (ndims) {
        case 1:
            for (int i0 = 0; i0 < ne[0]; i0++) {
                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
            }
            break;
        case 2:
            for (int i1 = 0; i1 < ne[1]; i1++) {
                for (int i0 = 0; i0 < ne[0]; i0++) {
                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                }
            }
            break;
        case 3:
            for (int i2 = 0; i2 < ne[2]; i2++) {
                for (int i1 = 0; i1 < ne[1]; i1++) {
                    for (int i0 = 0; i0 < ne[0]; i0++) {
                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
            break;
        case 4:
            for (int i3 = 0; i3 < ne[3]; i3++) {
                for (int i2 = 0; i2 < ne[2]; i2++) {
                    for (int i1 = 0; i1 < ne[1]; i1++) {
                        for (int i0 = 0; i0 < ne[0]; i0++) {
                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
                        }
                    }
                }
            }
            break;
        default:
            assert(false);
    }

    return result;
}

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 1024*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    int64_t ne1[4] = {4, 128, 1, 1};
    int64_t ne2[4] = {4, 256, 1, 1};
    int64_t ne3[4] = {128, 256, 1, 1};

    // a and b are the parameters to optimize; c is the fixed random target
    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
    ggml_set_param(ctx, a);
    ggml_set_param(ctx, b);

    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);

    // scalar loss: e = sum((c - mul_mat(a, b))^2)
    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));

    struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
    ggml_build_forward_expand(ge, e);
    ggml_graph_reset(ge);

    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);

    const float fe = ggml_get_f32_1d(e, 0);
    printf("%s: e = %.4f\n", __func__, fe);

    // minimize e over a and b with the default Adam settings
    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);

    ggml_opt(ctx, opt_params, e);

    // recompute the graph to read back the optimized loss
    ggml_graph_reset(ge);

    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);

    const float fe_opt = ggml_get_f32_1d(e, 0);
    printf("%s: original e = %.4f\n", __func__, fe);
    printf("%s: optimized e = %.4f\n", __func__, fe_opt);

    // the optimizer must not have increased the loss
    const bool success = (fe_opt <= fe);
    assert(success);

    ggml_free(ctx);

    return success ? 0 : -1;
}
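
// sample runs: loss before and after optimization for various tensor sizes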
// int64_t ne1[4] = {4, 128, 1, 1};
// int64_t ne2[4] = {4, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 25890.9375
// main: optimized e = 10094.7031

// int64_t ne1[4] = {8, 128, 1, 1};
// int64_t ne2[4] = {8, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 39429.5078
// main: optimized e = 9275.8936

// int64_t ne1[4] = {16, 128, 1, 1};
// int64_t ne2[4] = {16, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 68371.1328
// main: optimized e = 7854.4502

// int64_t ne1[4] = {32, 128, 1, 1};
// int64_t ne2[4] = {32, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 126061.1953
// main: optimized e = 5451.0166

// int64_t ne1[4] = {4, 1024, 1, 1};
// int64_t ne2[4] = {4, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 1620817.8750
// main: optimized e = 698387.6875

// another run on M1
// int64_t ne1[4] = {4, 1024, 1, 1};
// int64_t ne2[4] = {4, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 1629595.6250
// main: optimized e = 698169.1250

// int64_t ne1[4] = {32, 1024, 1, 1};
// int64_t ne2[4] = {32, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 8146770.5000
// main: optimized e = 651119.1250