// nn.h
// include guards
#ifndef NN_H_INCLUDED
#define NN_H_INCLUDED

// Maximum tensor rank (number of dimensions) a tensor can have.
#define MAX_RANK 4
// Maximum number of input tensors recorded per op for the autograd graph.
#define MAX_INPUTS 3
// Length of generated tensor names (unique for each tensor) -- presumably
// consumed by random_chars()/set_name(); confirm in their definitions.
#define MAX_TENSOR_NAME 20
// Number of auxiliary tensors stashed during forward for reuse in backward.
#define MAX_SCRATCH_SPACE 1
  9  
 10  // todo: avoid needing to manually sync increment op_type, NUM_OPS, VIS_COLORS when adding a new op
 11  //  - use graphviz's pastel19 or set312 color scheme ?
 12  #define NUM_OPS 23
 13  const char* OP_NAMES[] = {"add", "sub", "mul", "matmul", "pow", "reduce_sum", "relu", "transpose", "batched_matmul", "conv", "batched_conv", "maxpool", "batched_maxpool", "batched_flatten", "select", "log", "exp", "batched_reduce_sum", "repeat", "neg", "div", "reduce_max", "batched_reduce_max"};
 14  const char* VIS_COLORS[] = {"darkolivegreen1", "lightsalmon1", "skyblue1", "plum1", "mediumpurple1", "aquamarine", "yellow", "seashell", "orchid2", "deeppink1", "deeppink3", "darkseagreen1", "darkseagreen3", "beige", "bisque", "cornsilk", "darkolivegreen1", "tan1", "deepskyblue", "chocolate", "slateblue", "lemonchiffon", "lightgoldenrodyellow"};
 15  
 16  
// tensor and its fn's are used in both ops.cpp and main.cpp
// Core n-dimensional array plus the bookkeeping the autograd engine needs to
// run a backward pass over the graph of ops that produced it.
struct tensor {
    float* data;           // element storage; may live on CPU or CUDA depending on `device`
    int shape[MAX_RANK];   // extent of each dimension; only the first num_dims entries are meaningful
    int device;            // CPU or CUDA (never 0 -- see the note by the device defines)

    // https://arxiv.org/pdf/1102.1523
    // Strides the number of bytes to skip in memory to
    // proceed to the next element. For a (10, 10)
    // array of bytes, for example, the strides may be
    // (10, 1), in other words: proceed one byte to
    // get to the next column and ten bytes to locate
    // the next row.
    // NOTE(review): the quote above speaks in bytes, but stride is an int
    // array parallel to shape -- presumably measured in *elements* here;
    // confirm against the kernels in ops.cpp.
    int stride[MAX_RANK];

    // rank
    int num_dims;

    // total element count, cached to avoid: int size = x->shape[0] * x->shape[1];
    int size;

    // for autograd engine:

    // true for user-created tensors (no producing op) -- TODO confirm against usage
    bool is_leaf;

    // accumulated gradient w.r.t. this tensor; same shape as the tensor itself -- TODO confirm
    tensor* grad;
    // per-op backward rule: folds `upstream` gradient into the inputs of `out`
    void (*grad_fn)(tensor* upstream, tensor* out);

    int num_inputs;                  // how many entries of inputs[] are valid
    tensor* inputs[MAX_INPUTS];      // tensors this one was computed from
    // question-now: malloc memory for this?
    int non_grad_inputs[MAX_INPUTS]; // flags/indices of inputs excluded from grad flow -- TODO confirm semantics
    int num_uses; // for the AG engine: num outputs of the this tensor left to call their out->grad_fn, before calling grad_fn on the current tensor
    int _num_uses; // working copy of num_uses consumed during a backward pass -- TODO confirm

    // to store data recorded in fwd and used in bwd (wt needing to recompute it in bwd)
    // e.g. in relu, reduce_max, maxpool: idxs recorded during forward _k and used in its corresponding _bwd fn
    tensor* scratch_space[MAX_SCRATCH_SPACE];

    char* name;   // unique display name (see MAX_TENSOR_NAME / random_chars)
    // id of the op that produced this tensor -- presumably an index into
    // OP_NAMES/VIS_COLORS; the old "use char as small int" note predates
    // the switch to int.
    int op_type;

    // entry point for the backward pass starting from this tensor
    void (*backward)(tensor* t);
};
 62  
 63  
// enum device {
//     cpu = 1,
//     cuda = 2,
//     // amd = 3,
//     // tpu = 4,
// };

// note: do not use 0 to denote a valid device, it conflicts with NULL pointer
// (e.g. checks like "t->device==CPU", if CPU is 0)
#define CPU 1
#define CUDA 2

// the following are only used if DEVICE == CUDA
// (note: there are three of them; an earlier comment said "two")
#define NUM_THREADS 16        // threads per block dimension -- TODO confirm against kernel launch configs
#define CUDA_DEBUG false      // extra logging/checking inside CUDA paths -- TODO confirm
#define DATA_COPY_DEBUG false // extra logging around host<->device copies -- TODO confirm
 80  
// --- initialization (names suggest standard weight-init schemes; confirm in ops.cpp) ---
void uniform_init(tensor*);          // presumably fills with uniform random values
void kaiming_normal_init(tensor*);   // presumably Kaiming-normal init (for relu nets)
void set_name(tensor*, const char*); // assigns/copies a display name onto the tensor
char* random_chars(int);             // random string of the given length -- caller ownership TODO confirm

// --- construction helpers ---
tensor* TensorLikeFill(tensor*, float); // new tensor with the same shape, filled with the given value -- TODO confirm
tensor* TensorLike(tensor* t);          // new tensor with the same shape; contents unspecified -- TODO confirm
void* checkMallocErrors(void* ptr);     // pass-through check of a malloc result (aborts/reports on NULL?) -- TODO confirm

// --- printing / debugging ---
void print(tensor*);       // print tensor contents
void lprint(tensor*);      // variant of print -- semantics TODO confirm
void cuda_lprint(tensor*); // lprint for device-resident tensors -- TODO confirm
void sprint(tensor* t);    // variant of print -- semantics TODO confirm
 94  
// A trainable parameter plus its per-parameter optimizer state.
// Forms an intrusive singly linked list via `next`.
struct param {
    tensor* value;         // the trainable weights themselves
    // sgd:
    tensor* velocity;      // momentum/velocity buffer
    // adam:
    int t;                 // timestep counter -- presumably for bias correction; confirm in optimizer code
    float beta1;           // first-moment decay rate
    float beta2;           // second-moment decay rate
    float epsilon;         // divide-by-zero guard in the update step
    tensor* first_moment;  // running mean of gradients
    tensor* second_moment; // running mean of squared gradients

    param* next;           // next parameter in the list (NULL-terminated? TODO confirm)
};
109  
// Simple pair of floats, for returning two scalars from one call.
struct tuple {
    float item_1;
    float item_2;
};


// Builds a tuple holding (val1, val2). Returned by pointer, so presumably
// heap-allocated -- confirm ownership/free responsibility at the definition.
tuple* get_tuple(float val1, float val2);
117  
118  #endif