// serialization.cpp
// todo-now:
// think more about how load_all_params would interact with the GC list
// simply using save_tensor / load_tensor likely will not work correctly with the GC


void save_tensor(tensor* t, FILE* f){

    // COPY_FROM_DEVICE returns a new tensor that carries only t->data, dropping the
    //  other fields -- but I want to preserve the name of the original param;
    // I hesitate to just copy t->name in COPY_FROM_DEVICE, because that would result
    //    in two different tensors having the same name unless COPY_FROM_DEVICE
    //    immediately frees the cpu tensor without waiting for the GC to do it later;
    //
    // todo-high: longer term, change COPY_FROM_DEVICE to modify t->data in place instead
    //    of returning a new tensor? (see the sketch after this function)
    char* name = t->name;

    if (t->device == CUDA){
        t = COPY_FROM_DEVICE(t);
    }

    // write / read in a specific order:

    //  (1) fixed sized

    fwrite(&t->num_dims, sizeof(int), 1, f);
    fwrite(&t->size, sizeof(int), 1, f);
    fwrite(&t->is_leaf, sizeof(bool), 1, f);

    // // not meaningful to save/load these fields:
    // fwrite(&t->device, sizeof(int), 1, f);
    // fwrite(&t->num_inputs, sizeof(int), 1, f);
    // fwrite(&t->num_uses, sizeof(int), 1, f);
    // fwrite(&t->_num_uses, sizeof(int), 1, f);

    //  (2) dynamic sized members

    fwrite(t->data, sizeof(float), t->size, f);
    fwrite(t->shape, sizeof(int), MAX_RANK, f);
    fwrite(t->stride, sizeof(int), MAX_RANK, f);
    fwrite(name, sizeof(char), MAX_TENSOR_NAME, f);

    // // not meaningful to save/load these fields -- a weight
    // // is a leaf tensor, so it does not have these fields:
    // t->op_type;
    // t->grad;
    // t->scratch_space;
    // t->inputs;
    // t->non_grad_inputs;

    // // not meaningful to save/load these -- function
    // // pointers don't make sense when the program restarts
    // t->grad_fn = NULL;
    // t->backward = backward;
}
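

// A minimal sketch of the in-place alternative mentioned in the todo-high above.
// This is only an illustration (kept commented out, not part of the build): it assumes
// t->data holds a device pointer when t->device == CUDA, and that cudaMemcpy / cudaFree
// are available; the name "copy_from_device_inplace" is hypothetical.
//
// void copy_from_device_inplace(tensor* t){
//     if (t->device != CUDA) return;
//     float* host = (float*)checkMallocErrors(malloc(sizeof(float) * t->size));
//     cudaMemcpy(host, t->data, sizeof(float) * t->size, cudaMemcpyDeviceToHost);
//     cudaFree(t->data);   // release the device buffer
//     t->data = host;      // the tensor keeps its name, shape, stride, etc.
//     t->device = CPU;
// }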


//  this fn assumes the exact order in which struct members were written to the file (inside the "save_tensor" fn)
tensor* load_tensor(FILE* f){
    tensor* t = (tensor*)checkMallocErrors(malloc(sizeof(tensor)));

    //  (1) fixed sized:

    fread(&t->num_dims, sizeof(int), 1, f);
    fread(&t->size, sizeof(int), 1, f);
    fread(&t->is_leaf, sizeof(bool), 1, f);
    // fread(&t->device, sizeof(int), 1, f);
    // fread(&t->op_type, sizeof(int), 1, f);
    // fread(&t->num_inputs, sizeof(int), 1, f);
    // fread(&t->num_uses, sizeof(int), 1, f);
    // fread(&t->_num_uses, sizeof(int), 1, f);

    //  (2) dynamic sized members:

    t->data = (float*)checkMallocErrors(malloc(sizeof(float) * t->size));
    fread(t->data, sizeof(float), t->size, f);

    // question-now: I don't think I need to allocate space for these -- memory
    //   for these array members should already be included in the "malloc(sizeof(tensor))" above
    //   (see the struct sketch after this function)
    fread(t->shape, sizeof(int), MAX_RANK, f);
    fread(t->stride, sizeof(int), MAX_RANK, f);

    t->name = (char*)checkMallocErrors(malloc(sizeof(char) * MAX_TENSOR_NAME));
    fread(t->name, sizeof(char), MAX_TENSOR_NAME, f);

    // (3) set defaults
    t->grad_fn = NULL;
    t->grad = NULL;
    t->backward = backward;
    // COPY_TO_DEVICE expects a CPU tensor; sometimes my constructors
    // don't set the device, so it could otherwise hold a garbage value
    t->device = CPU;
    t->num_inputs = 0;
    t->num_uses = 0;

    // move to device

    // note: need to do the copying only after all the fields have been
    // read -- otherwise the remaining reads get corrupted
    if (DEVICE == CUDA){
        COPY_TO_DEVICE(t);
    }

    if (ferror(f)){
        t = NULL;
    }

    return t;
}
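

// For reference, the parts of the tensor struct that this file relies on, sketched from the
// reads/writes above. The real definition lives elsewhere; the exact types and field order
// here are assumptions. In particular, the freads into t->shape / t->stride without a
// separate allocation are only correct if those are fixed-size in-struct arrays (as the
// question-now above suggests), not pointers.
//
// struct tensor {
//     int    num_dims;
//     int    size;
//     bool   is_leaf;
//     int    device;            // CPU / CUDA
//     float* data;              // heap (or device) buffer of `size` floats
//     int    shape[MAX_RANK];   // in-struct arrays, covered by malloc(sizeof(tensor))
//     int    stride[MAX_RANK];
//     char*  name;              // heap buffer of MAX_TENSOR_NAME chars
//     // autograd fields that are not serialized: grad, grad_fn, backward, op_type,
//     // scratch_space, inputs, non_grad_inputs, num_inputs, num_uses, _num_uses, ...
// };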


void save_param(param* p, FILE* f){

    // write / read in a specific order:

    //  (1) fixed sized

    fwrite(&p->t, sizeof(int), 1, f);
    fwrite(&p->beta1, sizeof(float), 1, f);
    fwrite(&p->beta2, sizeof(float), 1, f);
    fwrite(&p->epsilon, sizeof(float), 1, f);

    // can't meaningfully save/restore these:
    // p->next;

    //  (2) dynamic sized members

    save_tensor(p->value, f);
    save_tensor(p->velocity, f);
    save_tensor(p->first_moment, f);
    save_tensor(p->second_moment, f);
}


param* load_param(FILE* f){

    param* p = (param*)checkMallocErrors(malloc(sizeof(param)));

    fread(&p->t, sizeof(int), 1, f);
    fread(&p->beta1, sizeof(float), 1, f);
    fread(&p->beta2, sizeof(float), 1, f);
    fread(&p->epsilon, sizeof(float), 1, f);

    p->value = load_tensor(f);
    p->velocity = load_tensor(f);
    p->first_moment = load_tensor(f);
    p->second_moment = load_tensor(f);

    // set defaults
    p->next = NULL;

    if (ferror(f)){
        p = NULL;
    }

    return p;
}


void save_all_params(const char* prefix, int ep_idx){

    char path[256];
    snprintf(path, sizeof(path), "./generated/checkpoints/%s.dat", prefix);

    // open in binary mode ("wb", which also truncates any existing file)
    // so the writes match the "rb" used in load_all_params
    FILE *f = fopen(path, "wb");
    if (!f) {
        printf("Error opening file\n");
        exit(1);
    }

    // used in load_all_params to know how many times to iterate
    int num_params = count_params();
    fwrite(&num_params, sizeof(int), 1, f);

    float learning_rate = LR;
    fwrite(&learning_rate, sizeof(float), 1, f);

    fwrite(&ep_idx, sizeof(int), 1, f);

    param* temp = param_head;
    while (temp){
        save_param(temp, f);
        temp = temp->next;
    }
    fclose(f);

    printf("[save_all_params] Parameters saved.\n");
}
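

// For reference, the resulting checkpoint file layout (it follows the write order above):
//
//   int   num_params
//   float learning_rate
//   int   ep_idx
//   then num_params records, one per param (see save_param):
//     int t, float beta1, float beta2, float epsilon
//     4 tensors (value, velocity, first_moment, second_moment), each written by save_tensor as:
//       int num_dims, int size, bool is_leaf,
//       float data[size], int shape[MAX_RANK], int stride[MAX_RANK], char name[MAX_TENSOR_NAME]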


int load_all_params(char* prefix){
    if (param_head){
        printf("[load_all_params] loading params while the param list is not empty is not supported\n");
        exit(1);
    }

    char path[256];
    snprintf(path, sizeof(path), "./generated/checkpoints/%s.dat", prefix);

    FILE *f = fopen(path, "rb");
    if (!f) {
        printf("Error opening file\n");
        exit(1);
    }

    int num_params;
    fread(&num_params, sizeof(int), 1, f);

    // not used at the moment
    float learning_rate;
    fread(&learning_rate, sizeof(float), 1, f);

    // need to return this so the caller can keep incrementing ep_idx after loading
    // (instead of starting again from 0) and avoid overwriting previous checkpoints
    int ep_idx;
    fread(&ep_idx, sizeof(int), 1, f);

    param* loaded;
    for (int i=0; i<num_params; i++){
        loaded = load_param(f);
        printf("[load_all_params] loaded param for %s tensor\n", loaded->value->name);
        // note: prepending rebuilds the list in reverse order relative to how it was saved
        loaded->next = param_head;
        param_head = loaded;
    }
    fclose(f);

    printf("[load_all_params] Parameters loaded.\n");
    return ep_idx;
}
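

// A minimal usage sketch (illustrative only and kept commented out; "resume", "build_model",
// "train_one_epoch", "ckpt_prefix" and NUM_EPOCHS are hypothetical names, not defined in this repo):
//
//   int start_ep = 0;
//   if (resume) {
//       // rebuilds param_head from the checkpoint and returns the saved epoch index
//       start_ep = load_all_params(ckpt_prefix) + 1;
//   } else {
//       build_model();   // populates param_head with freshly initialized params
//   }
//   for (int ep = start_ep; ep < NUM_EPOCHS; ep++){
//       train_one_epoch();
//       save_all_params(ckpt_prefix, ep);
//   }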