// tests/test-grad0.cpp
   1  #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
   2  #include "ggml.h"
   3  
   4  #include <cmath>
   5  #include <cstdio>
   6  #include <cstdlib>
   7  #include <cassert>
   8  
   9  #if defined(_MSC_VER)
  10  #pragma warning(disable: 4244 4267) // possible loss of data
  11  #endif
  12  
  13  #if defined(__GNUC__)
  14  #pragma GCC diagnostic ignored "-Wdouble-promotion"
  15  #endif
  16  
  17  #define MAX_NARGS 3
  18  
  19  #undef MIN
  20  #undef MAX
  21  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  22  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  23  
  24  #define GGML_SILU_FP16
  25  
  26  //
  27  // logging
  28  //
  29  
  30  #if (GGML_DEBUG >= 1)
  31  #define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
  32  #else
  33  #define GGML_PRINT_DEBUG(...)
  34  #endif
  35  
  36  #if (GGML_DEBUG >= 5)
  37  #define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
  38  #else
  39  #define GGML_PRINT_DEBUG_5(...)
  40  #endif
  41  
  42  #if (GGML_DEBUG >= 10)
  43  #define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
  44  #else
  45  #define GGML_PRINT_DEBUG_10(...)
  46  #endif
  47  
  48  #define GGML_PRINT(...) printf(__VA_ARGS__)
  49  
  50  static float frand(void) {
  51      return (float)rand()/(float)RAND_MAX;
  52  }
  53  
  54  static int irand(int n) {
  55      if (n == 0) return 0;
  56      return rand()%n;
  57  }
  58  
  59  static void get_random_dims(int64_t * dims, int ndims) {
  60      dims[0] = dims[1] = dims[2] = dims[3] = 1;
  61  
  62      for (int i = 0; i < ndims; i++) {
  63          dims[i] = 1 + irand(4);
  64      }
  65  }
  66  
  67  static struct ggml_tensor * get_random_tensor_f32(
  68          struct ggml_context * ctx0,
  69          int ndims,
  70          int64_t ne[],
  71          float fmin,
  72          float fmax) {
  73      struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
  74  
  75      switch (ndims) {
  76          case 1:
  77              for (int i0 = 0; i0 < ne[0]; i0++) {
  78                  ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
  79              }
  80              break;
  81          case 2:
  82              for (int i1 = 0; i1 < ne[1]; i1++) {
  83                  for (int i0 = 0; i0 < ne[0]; i0++) {
  84                      ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  85                  }
  86              }
  87              break;
  88          case 3:
  89              for (int i2 = 0; i2 < ne[2]; i2++) {
  90                  for (int i1 = 0; i1 < ne[1]; i1++) {
  91                      for (int i0 = 0; i0 < ne[0]; i0++) {
  92                          ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
  93                      }
  94                  }
  95              }
  96              break;
  97          case 4:
  98              for (int i3 = 0; i3 < ne[3]; i3++) {
  99                  for (int i2 = 0; i2 < ne[2]; i2++) {
 100                      for (int i1 = 0; i1 < ne[1]; i1++) {
 101                          for (int i0 = 0; i0 < ne[0]; i0++) {
 102                              ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
 103                          }
 104                      }
 105                  }
 106              }
 107              break;
 108          default:
 109              assert(false);
 110      }
 111  
 112      return result;
 113  }
 114  
 115  static struct ggml_tensor * get_random_tensor_f16(
 116          struct ggml_context * ctx0,
 117          int ndims,
 118          int64_t ne[],
 119          float fmin,
 120          float fmax) {
 121      struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
 122  
 123      switch (ndims) {
 124          case 1:
 125              for (int i0 = 0; i0 < ne[0]; i0++) {
 126                  ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
 127              }
 128              break;
 129          case 2:
 130              for (int i1 = 0; i1 < ne[1]; i1++) {
 131                  for (int i0 = 0; i0 < ne[0]; i0++) {
 132                      ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
 133                  }
 134              }
 135              break;
 136          case 3:
 137              for (int i2 = 0; i2 < ne[2]; i2++) {
 138                  for (int i1 = 0; i1 < ne[1]; i1++) {
 139                      for (int i0 = 0; i0 < ne[0]; i0++) {
 140                          ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
 141                      }
 142                  }
 143              }
 144              break;
 145          case 4:
 146              for (int i3 = 0; i3 < ne[3]; i3++) {
 147                  for (int i2 = 0; i2 < ne[2]; i2++) {
 148                      for (int i1 = 0; i1 < ne[1]; i1++) {
 149                          for (int i0 = 0; i0 < ne[0]; i0++) {
 150                              ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
 151                          }
 152                      }
 153                  }
 154              }
 155              break;
 156          default:
 157              assert(false);
 158      }
 159  
 160      return result;
 161  }
 162  
 163  static struct ggml_tensor * get_random_tensor_i32(
 164          struct ggml_context * ctx0,
 165          int ndims,
 166          int64_t ne[],
 167          int32_t imin,
 168          int32_t imax) {
 169      struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne);
 170  
 171      switch (ndims) {
 172          case 1:
 173              for (int i0 = 0; i0 < ne[0]; i0++) {
 174                  ((int32_t *)result->data)[i0] = irand(imax - imin) + imin;
 175              }
 176              break;
 177          case 2:
 178              for (int i1 = 0; i1 < ne[1]; i1++) {
 179                  for (int i0 = 0; i0 < ne[0]; i0++) {
 180                      ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin;
 181                  }
 182              }
 183              break;
 184          case 3:
 185              for (int i2 = 0; i2 < ne[2]; i2++) {
 186                  for (int i1 = 0; i1 < ne[1]; i1++) {
 187                      for (int i0 = 0; i0 < ne[0]; i0++) {
 188                          ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
 189                      }
 190                  }
 191              }
 192              break;
 193          case 4:
 194              for (int i3 = 0; i3 < ne[3]; i3++) {
 195                  for (int i2 = 0; i2 < ne[2]; i2++) {
 196                      for (int i1 = 0; i1 < ne[1]; i1++) {
 197                          for (int i0 = 0; i0 < ne[0]; i0++) {
 198                              ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin;
 199                          }
 200                      }
 201                  }
 202              }
 203              break;
 204          default:
 205              assert(false);
 206      }
 207  
 208      return result;
 209  }
 210  
// Verify the analytic gradients of f w.r.t. every element of the nargs
// parameter tensors in x[] against central finite differences.
//
//   op_name       - label used in failure diagnostics only
//   ctx0          - ggml context holding the tensors and graphs
//   x             - array of parameter tensors (marked via ggml_set_param)
//   f             - scalar output tensor; d(f)/d(x) is what gets checked
//   ndims         - tensor rank, reported in diagnostics only
//   nargs         - number of entries of x[] to check
//   eps           - finite-difference step size
//   max_error_abs - per-element absolute error tolerance
//   max_error_rel - per-element relative error tolerance
//
// Returns true if every element of every gradient is within tolerance,
// false (with a printf diagnostic) on the first violation.
static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
        struct ggml_tensor * x[],
        struct ggml_tensor * f,
        int ndims,
        int nargs,
        float eps,
        float max_error_abs,
        float max_error_rel) {

    // thread count is resolved once per process: GGML_N_THREADS env var
    // overrides the library default
    static int n_threads = -1;
    if (n_threads < 0) {
        n_threads = GGML_DEFAULT_N_THREADS;

        const char *env = getenv("GGML_N_THREADS");
        if (env) {
            n_threads = atoi(env);
        }

        printf("GGML_N_THREADS = %d\n", n_threads);
    }

    // gf computes f forward; gb is a copy extended with the backward pass
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true);
    ggml_build_forward_expand(gf, f);
    ggml_graph_cpy(gf, gb);
    ggml_build_backward_expand(ctx0, gf, gb, false);

    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

    // seed the backward pass: zero all grads, then d(f)/d(f) = 1
    ggml_graph_reset  (gf);
    ggml_set_f32      (f->grad, 1.0f);

    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
    // ggml_graph_dump_dot(gb, gf,  "test-grad0-backward.dot");

    for (int i = 0; i < nargs; ++i) {
        const int nelements = ggml_nelements(x[i]);
        for (int k = 0; k < nelements; ++k) {
            // compute gradient using finite differences
            const float x0 = ggml_get_f32_1d(x[i], k);
            const float xm = x0 - eps;
            const float xp = x0 + eps;
            ggml_set_f32_1d(x[i], k, xp);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f0 = ggml_get_f32_1d(f, 0);

            ggml_set_f32_1d(x[i], k, xm);

            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);

            const double f1 = ggml_get_f32_1d(f, 0);
            // central difference: (f(x+eps) - f(x-eps)) / (2*eps)
            const double g0 = (f0 - f1)/(2.0*(double) eps);

            // restore the element before running the backward graph
            ggml_set_f32_1d(x[i], k, x0);

            // compute gradient using backward graph
            ggml_graph_reset  (gf);
            ggml_set_f32      (f->grad, 1.0f);

            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);

            const double g1 = ggml_get_f32_1d(x[i]->grad, k);

            const double error_abs = fabs(g0 - g1);
            // relative error is only meaningful when the reference gradient is nonzero
            const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;

            if (error_abs > max_error_abs || error_rel > max_error_rel) {
                printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
                            op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel);
                //assert(false);
                return false;
            }
        }
    }

    return true;
}
 294  
 295  // TODO: clean-up this ..
 296  static bool check_mat_mul(
 297          const struct ggml_tensor * y,
 298          const struct ggml_tensor * x0,
 299          const struct ggml_tensor * x1) {
 300      float * dst  = (float *) y->data;
 301      float * src0 = (float *) x0->data;
 302      float * src1 = (float *) x1->data;
 303  
 304      const int nc = x0->ne[1];
 305      const int nr = x1->ne[1];
 306      const int nk = x0->ne[0];
 307  
 308      GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk);
 309  
 310      GGML_PRINT_DEBUG("x0:\n");
 311      for (int j = 0; j < x0->ne[1]; ++j) {
 312          for (int i = 0; i < x0->ne[0]; ++i) {
 313              GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]);
 314          }
 315          GGML_PRINT_DEBUG("\n");
 316      }
 317      GGML_PRINT_DEBUG("\n");
 318  
 319      GGML_PRINT_DEBUG("x1:\n");
 320      for (int j = 0; j < x1->ne[1]; ++j) {
 321          for (int i = 0; i < x1->ne[0]; ++i) {
 322              GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]);
 323          }
 324          GGML_PRINT_DEBUG("\n");
 325      }
 326      GGML_PRINT_DEBUG("\n");
 327  
 328      GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]);
 329      for (int j = 0; j < y->ne[1]; ++j) {
 330          for (int i = 0; i < y->ne[0]; ++i) {
 331              GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]);
 332          }
 333          GGML_PRINT_DEBUG("\n");
 334      }
 335  
 336      for (int i = 0; i < nr; ++i) {
 337          for (int j = 0; j < nc; ++j) {
 338              float sum = 0.0f;
 339  
 340              for (int k = 0; k < nk; ++k) {
 341                  sum += src0[j*nk + k]*src1[i*nk + k];
 342              }
 343  
 344              if (fabsf(dst[i*nc + j] - sum) > 1e-5f) {
 345                  fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum);
 346                  assert(false);
 347                  return false;
 348              }
 349          }
 350      }
 351  
 352      return true;
 353  }
 354  
 355  #define NUM_PERMUTATIONS (4*3*2*1)
 356  
 357  int main(int argc, const char ** argv) {
 358      struct ggml_init_params params = {
 359          /* .mem_size   = */ 256*1024*1024,
 360          /* .mem_buffer = */ NULL,
 361          /* .no_alloc   = */ false,
 362      };
 363  
 364      int64_t ne[4];
 365  
 366      int all_permutations[4 * NUM_PERMUTATIONS];
 367      {
 368          int count = 0;
 369          for (int ax0=0; ax0<4; ++ax0) {
 370              for (int ax1=0; ax1<4; ++ax1) {
 371                  if (ax1 == ax0) continue;
 372                  for (int ax2=0; ax2<4; ++ax2) {
 373                      if (ax2 == ax0) continue;
 374                      if (ax2 == ax1) continue;
 375                      for (int ax3=0; ax3<4; ++ax3) {
 376                          if (ax3 == ax0) continue;
 377                          if (ax3 == ax1) continue;
 378                          if (ax3 == ax2) continue;
 379                          assert(count < NUM_PERMUTATIONS);
 380                          all_permutations[count*4+0] = ax0;
 381                          all_permutations[count*4+1] = ax1;
 382                          all_permutations[count*4+2] = ax2;
 383                          all_permutations[count*4+3] = ax3;
 384                          ++count;
 385                      }
 386                  }
 387              }
 388          }
 389      }
 390  
 391      unsigned seed_iter = 1;
 392  
 393      // original loop: 1000
 394      int niter = 4;
 395      const char *env = getenv("GGML_NLOOP");
 396      if (env != NULL) {
 397          niter = atoi(env);
 398      }
 399      if (argc > 1) {
 400          niter = atoi(argv[1]);
 401      }
 402      for (int iter = 0; iter < niter; ++iter) {
 403          srand(seed_iter);
 404          seed_iter = rand();
 405          unsigned seed = rand();
 406  
 407          printf("test-grad0: iter:%d/%d\n", iter, niter);
 408          struct ggml_context * ctx0 = ggml_init(params);
 409  
 410          get_random_dims(ne, 4);
 411  
 412          struct ggml_tensor * x[MAX_NARGS];
 413  
 414          // add f32
 415          {
 416              srand(seed);
 417              const int nargs = 2;
 418  
 419              for (int ndims = 1; ndims <= 4; ++ndims) {
 420                  for (int i = 0; i < nargs; ++i) {
 421                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 422                      ggml_set_param(ctx0, x[i]);
 423                  }
 424  
 425                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
 426  
 427                  check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
 428              }
 429          }
 430  
 431          // add f16
 432          {
 433              srand(seed);
 434              const int nargs = 2;
 435  
 436              for (int ndims = 1; ndims <= 4; ++ndims) {
 437                  for (int i = 0; i < nargs; ++i) {
 438                      x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
 439                      ggml_set_param(ctx0, x[i]);
 440                  }
 441  
 442                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
 443  
 444                  check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
 445              }
 446          }
 447  
 448          // sub
 449          {
 450              srand(seed);
 451              const int nargs = 2;
 452  
 453              for (int ndims = 1; ndims <= 4; ++ndims) {
 454                  for (int i = 0; i < nargs; ++i) {
 455                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 456                      ggml_set_param(ctx0, x[i]);
 457                  }
 458  
 459                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1]));
 460  
 461                  check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 462              }
 463          }
 464  
 465          // mul
 466          {
 467              srand(seed);
 468              const int nargs = 2;
 469  
 470              for (int ndims = 1; ndims <= 4; ++ndims) {
 471                  for (int i = 0; i < nargs; ++i) {
 472                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 473                      ggml_set_param(ctx0, x[i]);
 474                  }
 475  
 476                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1]));
 477  
 478                  check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
 479              }
 480          }
 481  
 482          // div
 483          {
 484              srand(seed);
 485              const int nargs = 2;
 486  
 487              for (int ndims = 1; ndims <= 4; ++ndims) {
 488                  for (int i = 0; i < nargs; ++i) {
 489                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
 490                      ggml_set_param(ctx0, x[i]);
 491                  }
 492  
 493                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1]));
 494  
 495                  check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f);
 496              }
 497          }
 498  
 499          // sqr
 500          {
 501              srand(seed);
 502              const int nargs = 1;
 503  
 504              for (int ndims = 1; ndims <= 2; ++ndims) {
 505                  for (int i = 0; i < nargs; ++i) {
 506                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 507                      ggml_set_param(ctx0, x[i]);
 508                  }
 509  
 510                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0]));
 511  
 512                  check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
 513              }
 514          }
 515  
 516          // sqrt
 517          {
 518              srand(seed);
 519              const int nargs = 1;
 520  
 521              for (int ndims = 1; ndims <= 2; ++ndims) {
 522                  for (int i = 0; i < nargs; ++i) {
 523                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
 524                      ggml_set_param(ctx0, x[i]);
 525                  }
 526  
 527                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
 528  
 529                  check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
 530              }
 531          }
 532  
 533          // log
 534          {
 535              srand(seed);
 536              const int nargs = 1;
 537  
 538              for (int ndims = 1; ndims <= 2; ++ndims) {
 539                  for (int i = 0; i < nargs; ++i) {
 540                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
 541                      ggml_set_param(ctx0, x[i]);
 542                  }
 543  
 544                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0]));
 545  
 546                  check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
 547              }
 548          }
 549  
 550          // sum
 551          {
 552              srand(seed);
 553              const int nargs = 1;
 554  
 555              for (int ndims = 1; ndims <= 2; ++ndims) {
 556                  for (int i = 0; i < nargs; ++i) {
 557                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 558                      ggml_set_param(ctx0, x[i]);
 559                  }
 560  
 561                  struct ggml_tensor * f = ggml_sum(ctx0, x[0]);
 562  
 563                  check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 564              }
 565          }
 566  
 567  
 568          // sum_rows
 569          {
 570              srand(seed);
 571              const int nargs = 1;
 572  
 573              for (int ndims = 1; ndims <= 4; ++ndims) {
 574                  for (int i = 0; i < nargs; ++i) {
 575                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 576                      ggml_set_param(ctx0, x[i]);
 577                  }
 578  
 579                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0])));
 580  
 581                  check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
 582              }
 583          }
 584  
 585          // mean, not yet fully implemented
 586          if(0)
 587          {
 588              srand(seed);
 589              const int nargs = 1;
 590  
 591              for (int ndims = 1; ndims <= 4; ++ndims) {
 592                  for (int i = 0; i < nargs; ++i) {
 593                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 594                      ggml_set_param(ctx0, x[i]);
 595                  }
 596  
 597                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
 598  
 599                  check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 600              }
 601          }
 602  
 603          // argmax
 604          if (0)
 605          {
 606              srand(seed);
 607              const int nargs = 1;
 608  
 609              for (int ndims = 1; ndims <= 4; ++ndims) {
 610                  for (int i = 0; i < nargs; ++i) {
 611                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 612                      ggml_set_param(ctx0, x[i]);
 613                  }
 614  
 615                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
 616  
 617                  check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 618              }
 619          }
 620  
 621          // repeat
 622          {
 623              srand(seed);
 624              int64_t ne2[4];
 625              get_random_dims(ne2, 4);
 626  
 627              ne2[0] = ne[0] * ne2[0];
 628              ne2[1] = ne[1] * ne2[1];
 629              ne2[2] = 1;
 630              ne2[3] = 1;
 631  
 632              const int nargs = 1;
 633              for (int ndims = 1; ndims <= 2; ++ndims) {
 634                  x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 635                  x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
 636                  ggml_set_param(ctx0, x[0]);
 637  
 638                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
 639  
 640                  check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
 641              }
 642          }
 643  
 644          // repeat back
 645          {
 646              srand(seed);
 647              int64_t ne2[4];
 648              get_random_dims(ne2, 4);
 649  
 650              ne2[0] = ne[0] * ne2[0];
 651              ne2[1] = ne[1] * ne2[1];
 652              ne2[2] = 1;
 653              ne2[3] = 1;
 654  
 655              const int nargs = 1;
 656              for (int ndims = 1; ndims <= 2; ++ndims) {
 657                  x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 658                  x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
 659                  ggml_set_param(ctx0, x[0]);
 660  
 661                  struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
 662  
 663                  check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
 664              }
 665          }
 666  
 667          // abs (finite differences do not work)
 668          //{
 669          //    const int nargs = 1;
 670  
 671          //    for (int ndims = 1; ndims <= 2; ++ndims) {
 672          //        for (int i = 0; i < nargs; ++i) {
 673          //            x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 674          //            ggml_set_param(ctx0, x[i]);
 675          //        }
 676  
 677          //        struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0]));
 678  
 679          //        check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f);
 680          //    }
 681          //}
 682  
 683          // sgn
 684          {
 685              srand(seed);
 686              const int nargs = 1;
 687  
 688              for (int ndims = 1; ndims <= 4; ++ndims) {
 689                  for (int i = 0; i < nargs; ++i) {
 690                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 691                      ggml_set_param(ctx0, x[i]);
 692                  }
 693  
 694                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
 695  
 696                  check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 697              }
 698          }
 699  
 700          // neg
 701          {
 702              srand(seed);
 703              const int nargs = 1;
 704  
 705              for (int ndims = 1; ndims <= 4; ++ndims) {
 706                  for (int i = 0; i < nargs; ++i) {
 707                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 708                      ggml_set_param(ctx0, x[i]);
 709                  }
 710  
 711                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
 712  
 713                  check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 714              }
 715          }
 716  
 717          // step
 718          {
 719              srand(seed);
 720              const int nargs = 1;
 721  
 722              for (int ndims = 1; ndims <= 4; ++ndims) {
 723                  for (int i = 0; i < nargs; ++i) {
 724                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 725                      ggml_set_param(ctx0, x[i]);
 726                  }
 727  
 728                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
 729  
 730                  check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 731              }
 732          }
 733  
 734          // tanh, not yet fully implemented
 735          if(0)
 736          {
 737              srand(seed);
 738              const int nargs = 1;
 739  
 740              for (int ndims = 1; ndims <= 4; ++ndims) {
 741                  for (int i = 0; i < nargs; ++i) {
 742                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 743                      ggml_set_param(ctx0, x[i]);
 744                  }
 745  
 746                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
 747  
 748                  check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 749              }
 750          }
 751  
 752          // mul_mat
 753          {
 754              srand(seed);
 755              const int nargs = 2;
 756  
 757              for (int ndims = 2; ndims <= 4; ++ndims) {
 758                  int max_nrep = (ndims >= 3) ? 2 : 1;
 759                  x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 760                  for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
 761                      for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
 762                          {
 763                              int64_t ne2[4];
 764                              get_random_dims(ne2, 4);
 765                              ne2[0] = ne[0];
 766                              ne2[2] = nrep2 * ne[2];
 767                              ne2[3] = nrep3 * ne[3];
 768                              x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
 769                          }
 770  
 771                          ggml_set_param(ctx0, x[0]);
 772                          ggml_set_param(ctx0, x[1]);
 773  
 774                          struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
 775                          struct ggml_tensor * f = ggml_sum(ctx0, m);
 776  
 777                          GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
 778  
 779                          check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
 780                          if (ndims == 2) {
 781                              // check_mat_mul does not support ndims > 2
 782                              check_mat_mul(m, x[1], x[0]);
 783                          }
 784                      }
 785                  }
 786              }
 787          }
 788  
 789          // elu, not yet fully implemented
 790          if(0)
 791          {
 792              srand(seed);
 793              const int nargs = 1;
 794  
 795              for (int ndims = 1; ndims <= 4; ++ndims) {
 796                  for (int i = 0; i < nargs; ++i) {
 797                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 798                      ggml_set_param(ctx0, x[i]);
 799                  }
 800  
 801                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
 802  
 803                  check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 804              }
 805          }
 806  
 807          // relu
 808          {
 809              srand(seed);
 810              const int nargs = 1;
 811  
 812              for (int ndims = 1; ndims <= 4; ++ndims) {
 813                  for (int i = 0; i < nargs; ++i) {
 814                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 815                      ggml_set_param(ctx0, x[i]);
 816                  }
 817  
 818                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
 819  
 820                  check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
 821              }
 822          }
 823  
 824          // gelu, not yet fully implemented
 825          if(0)
 826          {
 827              srand(seed);
 828              const int nargs = 1;
 829  
 830              for (int ndims = 1; ndims <= 4; ++ndims) {
 831                  for (int i = 0; i < nargs; ++i) {
 832                      x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 833                      ggml_set_param(ctx0, x[i]);
 834                  }
 835  
 836                  struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
 837  
 838                  check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
 839              }
 840          }
 841  
        // silu: check gradient of silu (x * sigmoid(x)) for 1..2 dims.
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                // scalar objective: f = sum(silu(x0))
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0]));

#ifdef GGML_SILU_FP16
                // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds.
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY);
#else
                check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
#endif
            }
        }
 863  
        // rms_norm: check gradient of RMS normalization (eps = 1e-6) for 1..2 dims.
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }

                // scalar objective: f = sum(rms_norm(x0))
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));

                check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
            }
        }
 880  
        // scale: check gradient of multiplying a tensor by a random scalar s in (-1, 1).
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                // random scale factor in (-1, 1)
                const float s = -1.0f + 2.0f*frand();

                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s));

                check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
 898  
        // cpy f32: check gradient flow through a tensor copy (f32 -> f32).
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
 916  
        // cpy f16: same as cpy f32 but with f16 tensors; bounds are looser (1e-1)
        // because of the reduced half-precision accuracy.
        {
            srand(seed);
            const int nargs = 2;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                for (int i = 0; i < nargs; ++i) {
                    x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
                    ggml_set_param(ctx0, x[i]);
                }
                // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));

                check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
            }
        }
 934  
        // reshape (1d->nd): reshape a 1-d tensor into the ndims shape of x[1].
        // x[1] only provides the target shape; it is not a parameter.
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                // ne2[0] = total element count of the ndims shape, so the reshape is size-preserving
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);


                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
 958  
        // reshape (nd->1d): flatten an ndims tensor into the 1-d shape of x[1].
        // x[1] only provides the target shape; it is not a parameter.
        {
            srand(seed);
            const int nargs = 1;

            for (int ndims = 1; ndims <= 2; ++ndims) {
                int64_t ne2[4];
                ne2[0] = 1;
                ne2[1] = 1;
                ne2[2] = 1;
                ne2[3] = 1;
                // ne2[0] = total element count of the ndims shape, so the reshape is size-preserving
                for (int i = 0; i < ndims; ++i) {
                    ne2[0] *= ne[i];
                }
                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);


                struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1]));
                check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
 982  
        // acc 1d: accumulate a random 1-d tensor x[1] into x[0] at a random byte offset;
        // gradients must flow into both operands.
        {
            srand(seed);
            int64_t ne2[4] = { 1, 1, 1, 1 };

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 1-d size that fits inside x[0]
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // offset is in bytes, hence the multiplication by the element size
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1010  
        // acc 2d: accumulate a random 2-d tensor x[1] into x[0] at a random 2-d offset.
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 2; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 2-d size that fits inside x[0]
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // random per-axis offsets, converted to a single byte offset via the strides nb[i]
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1043  
        // acc 3d: accumulate a random 3-d tensor x[1] into x[0] at a random 3-d offset.
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 3; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 3-d size that fits inside x[0]
                get_random_dims(ne2, 3);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 3);
                }

                x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // random per-axis offsets, converted to a single byte offset via the strides nb[i]
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                const int offset = offsets[0] + offsets[1] + offsets[2];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1078  
        // acc 4d: accumulate a random 4-d tensor x[1] into x[0] at a random 4-d offset.
        {
            srand(seed);
            int64_t ne2[4]         = { 1, 1, 1, 1 };
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 2;
            for (int ndims = 4; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 4-d size that fits inside x[0]
                get_random_dims(ne2, 4);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 4);
                }

                x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // random per-axis offsets, converted to a single byte offset via the strides nb[i]
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]);
                max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                offsets[2] = irand(max_offsets[2]) * x[0]->nb[2];
                offsets[3] = irand(max_offsets[3]) * x[0]->nb[3];
                const int offset = offsets[0] + offsets[1] + offsets[2] + offsets[3];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset));

                check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1115  
        // set_1d: overwrite a 1-d slice of x[0] with x[1] at a random byte offset;
        // gradients must flow into both operands (zeroed where x[0] was overwritten).
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 2;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 1-d size that fits inside x[0]
                get_random_dims(ne2, 1);
                while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 1);
                }

                x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // offset is in bytes, hence the multiplication by the element size
                const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
                const int offset = irand(max_offset) * ggml_element_size(x[0]);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset));

                check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1143  
        // set_2d: overwrite a 2-d region of x[0] with x[1] at a random 2-d offset.
        // NOTE(review): nargs is 1 here even though x[1] is also marked as a param,
        // unlike set_1d (nargs = 2) — presumably only the x[0] gradient is checked; confirm intended.
        {
            srand(seed);
            int64_t ne2[4];
            int64_t max_offsets[4] = { 0, 0, 0, 0 };
            int64_t offsets[4]     = { 0, 0, 0, 0 };

            const int nargs = 1;
            for (int ndims = 2; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                // rejection-sample a 2-d size that fits inside x[0]
                get_random_dims(ne2, 2);
                while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) {
                    get_random_dims(ne2, 2);
                }

                x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[1]);

                // random per-axis offsets, converted to a single byte offset via the strides nb[i]
                max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
                max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]);
                offsets[0] = irand(max_offsets[0]) * x[0]->nb[0];
                offsets[1] = irand(max_offsets[1]) * x[0]->nb[1];
                const int offset = offsets[0] + offsets[1];

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset));

                check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1176  
        // view_1d: take a random contiguous 1-d view [i0, i1) of x[0] and check that
        // gradients flow back only into the viewed elements.
        {
            srand(seed);
            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                // two random element indices define the view range
                const int k0 = irand(ggml_nelements(x[0]));
                const int k1 = irand(ggml_nelements(x[0]));
                const int i0 = MIN(k0, k1);
                const int i1 = MAX(k0, k1);

                // offset in bytes; NOTE(review): nelem may be 0 when k0 == k1 — confirm ggml_view_1d accepts an empty view
                const int offset = i0 * sizeof(float);
                const int nelem  = i1 - i0;

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset));

                check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1200  
        // view_2d: take a random contiguous 2-d view of x[0] (row stride nb2[1]) at a random offset.
        {
            srand(seed);
            int64_t ne2[4];
            int64_t nb2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                // rejection-sample a 2-d view size that fits in x[0]'s element count
                get_random_dims(ne2, 2);
                while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 2);
                }
                const int count = ne2[0]*ne2[1];

                // contiguous f32 strides for the view
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];

                ggml_set_param(ctx0, x[0]);

                // random byte offset such that the whole view stays inside x[0]
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset));

                check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1231  
        // view_3d: take a random contiguous 3-d view of x[0] (strides nb2[1], nb2[2]) at a random offset.
        {
            srand(seed);
            int64_t ne2[4] = {1,1,1,1};
            int64_t nb2[4] = {0,0,0,0};

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims) {

                x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

                // rejection-sample a 3-d view size that fits in x[0]'s element count
                get_random_dims(ne2, 3);
                while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
                    get_random_dims(ne2, 3);
                }
                const int count = ne2[0]*ne2[1]*ne2[2];

                // contiguous f32 strides for the view
                nb2[0] = sizeof(float);
                nb2[1] = nb2[0]*ne2[0];
                nb2[2] = nb2[1]*ne2[1];

                ggml_set_param(ctx0, x[0]);

                // random byte offset such that the whole view stays inside x[0]
                const int max_offset = ggml_nelements(x[0]) - count;
                const int offset = irand(max_offset+1) * sizeof(float);

                struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset));

                check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1263  
        // permute: apply a random axis permutation (picked from the precomputed
        // all_permutations table) and check gradients through the permutation.
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_permute will set axes of dimensions below n_dims to 1.
                // to make ggml_permute work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                // pad the remaining axes with size 1 so the tensor is always 4-d
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                const int p = irand(NUM_PERMUTATIONS);
                const int ax0 = all_permutations[p*4+0];
                const int ax1 = all_permutations[p*4+1];
                const int ax2 = all_permutations[p*4+2];
                const int ax3 = all_permutations[p*4+3];

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_permute(ctx0, x[0], ax0, ax1, ax2, ax3)));

                check_gradient("permute", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1297  
        // transpose: check gradients through a transpose of the first two axes.
        {
            srand(seed);
            int64_t ne2[4];

            const int nargs = 1;
            for (int ndims = 1; ndims <= 4; ++ndims)
            {
                // ggml_transpose will set axes of dimensions below n_dims to 1.
                // to make ggml_transpose work correctly on all axes,
                // the input tensor needs maximal n_dim of 4.
                for (int i=0; i<ndims; ++i) {
                    ne2[i] = ne[i];
                }
                // pad the remaining axes with size 1 so the tensor is always 4-d
                for (int i=ndims; i<4; ++i) {
                    ne2[i] = 1;
                }
                x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);

                ggml_set_param(ctx0, x[0]);

                // sum requires contiguous tensor rows
                struct ggml_tensor * f = ggml_sum(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, x[0])));

                check_gradient("transpose", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
            }
        }
1325  
        // get_rows: gather a random number of rows from x[0] using integer indices x[1]
        // (values in [0, ne2[1])). Only x[0] is a param: no gradient flows into the indices.
        {
            srand(seed);
            int64_t ne2[4] = {ne[0], ne[1], 1, 1};
            int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
            const int nargs = 1;
            const int ndims = 2;
            x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
            x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);

            ggml_set_param(ctx0, x[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_get_rows(ctx0, x[0], x[1]));

            check_gradient("get_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }
1342  
        // diag_mask_inf: mask the upper-diagonal part (past n_past) with -inf and check gradients.
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_inf(ctx0, x[0], n_past));

            check_gradient("diag_mask_inf", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }
1358  
        // diag_mask_zero: same as diag_mask_inf but the masked entries become 0.
        {
            srand(seed);
            const int nargs = 1;
            const int ndims = 2;

            x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
            ggml_set_param(ctx0, x[0]);

            int n_past = irand(ne[0]);

            struct ggml_tensor * f = ggml_sum(ctx0, ggml_diag_mask_zero(ctx0, x[0], n_past));

            check_gradient("diag_mask_zero", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
        }
1374  
        // softmax: check gradients through soft_max for 1..3 dims with random shapes.
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 3; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
                ggml_set_param(ctx0, x[0]);

                float eps = 1e-6f;
                // don't use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
                // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
                struct ggml_tensor * f = ggml_sum(ctx0,
                                            ggml_log(ctx0,
                                                ggml_add1(ctx0,
                                                    ggml_scale(ctx0,
                                                        ggml_soft_max(ctx0, x[0]),
                                                        1.0f - eps),
                                                    ggml_new_f32(ctx0, eps))));

                check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
                // this may result in gradients that differ from finite differences.
                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
                // if only the table lookup causes gradients to differ this is acceptable.
            }
        }
1405  
        // cross_entropy_loss: check gradients w.r.t. the logits x[0]; the target
        // distribution x[1] is row-normalized and is NOT a param (no gradient through labels).
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);

            for (int ndims = 1; ndims <= 4; ++ndims) {
                x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
                x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
                // the second argument to cross_entropy_loss must sum up to 1 for each row
                int nr = ggml_nrows(x[1]);
                int nc = ggml_nelements(x[1]) / nr;
                for (int ir = 0; ir < nr; ++ir) {
                    float sum = 0;
                    for (int ic = 0; ic < nc; ++ic) {
                        sum += ((float *) x[1]->data)[ic + ir*nc];
                    }
                    // normalize each row in place so it forms a probability distribution
                    for (int ic = 0; ic < nc; ++ic) {
                        ((float *) x[1]->data)[ic + ir*nc] /= sum;
                    }
                }
                ggml_set_param(ctx0, x[0]);

                struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);

                check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
            }
        }
1436  
        // rope f32: check gradients of rotary position embedding over dims 3..4,
        // all rope modes, and all valid n_past values.
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            // round ne2[0] up to an even number: rope rotates pairs of elements, so n_rot must be even
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);

                        // position tensor: one position per token, starting at n_past
                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));

                        GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
                    }
                }
            }
        }
1476  
        // rope f16: same as rope f32 but on f16 input; bounds are looser (1e-1)
        // because of the reduced half-precision accuracy.
        {
            srand(seed);
            const int nargs = 1;

            int64_t ne2[4];
            get_random_dims(ne2, 4);
            // round ne2[0] up to an even number: rope rotates pairs of elements, so n_rot must be even
            ne2[0] += ne2[0] % 2;
            int n_rot = ne2[0];

            for (int ndims = 3; ndims <= 4; ++ndims) {
                for (int mode = 0; mode < 4; ++mode) {
                    for (int n_past = 1; n_past < ne2[2]; ++n_past) {
                        x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);

                        // position tensor: one position per token, starting at n_past
                        struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]);
                        for (int i = 0; i < ne2[2]; ++i) {
                            ((int32_t *) p->data)[i] = n_past + i;
                        }

                        ggml_set_param(ctx0, x[0]);

                        const bool skip_past = (mode & 1);
                        if (skip_past) {
                            // we have no past, so this would have to work on uninitialized memory.
                            // we only test the gradients here;
                            // skip_past should have no influence on gradient computation.
                            // so when other modes work, we assume that this does as well.
                            continue;
                        }

                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode));

                        GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
                        check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
                    }
                }
            }
        }
1516  
1517          // flash_attn f32
1518          // TODO: adapt to ggml_flash_attn_ext() changes
1519          //{
1520          //    srand(seed);
1521          //    const int nargs = 3;
1522  
1523          //    int64_t ne2[4];
1524  
1525          //    get_random_dims(ne2, 4);
1526          //    int64_t D = ne2[0];
1527          //    int64_t N = ne2[1];
1528          //    int64_t M = ne2[2] + N;
1529          //    int64_t B = ne2[3];
1530  
1531          //    for (int masked = 0; masked <= 1; ++masked) {
1532          //        for (int ndims = 2; ndims <= 4; ++ndims) {
1533          //            int max_nrep = (ndims >= 3) ? 2 : 1;
1534          //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
1535          //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
1536          //                int64_t nek[4] = { D, M, B, ne[3] };
1537          //                int64_t nev[4] = { M, D, B, ne[3] };
1538          //                if (ndims == 2) {
1539          //                    neq[2] = 1; neq[3] = 1;
1540          //                    nek[2] = 1; nek[3] = 1;
1541          //                    nev[2] = 1; nev[3] = 1;
1542          //                } else if (ndims == 3) {
1543          //                    neq[3] = 1;
1544          //                    nek[3] = 1;
1545          //                    nev[3] = 1;
1546          //                }
1547          //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
1548          //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
1549          //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
1550          //                ggml_set_param(ctx0, x[0]);
1551          //                ggml_set_param(ctx0, x[1]);
1552          //                ggml_set_param(ctx0, x[2]);
1553  
1554          //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
1555  
1556          //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
1557          //            }
1558          //        }
1559          //    }
1560          //}
1561  
1562          ggml_free(ctx0);
1563      }
1564  
1565      return 0;
1566  }