// cifar10.cpp
  1  #include <iostream> // todo: use C only
  2  #include <stdio.h> // structure declaration called FILE
  3  #include <string.h> // memcopy
  4  
  5  
  6  
  7  /*
  8  The binary version contains the files data_batch_1.bin, data_batch_2.bin, ..., data_batch_5.bin, as well as test_batch.bin. Each of these files is formatted as follows:
  9  <1 x label><3072 x pixel>
 10  ...
 11  <1 x label><3072 x pixel>
 12  In other words, the first byte is the label of the first image, which is a number in the range 0-9. The next 3072 bytes are the values of the pixels of the image.
 13  The first 1024 bytes are the red channel values, the next 1024 the green, and the final 1024 the blue.
 14  The values are stored in row-major order, so the first 32 bytes are the red channel values of the first row of the image.
 15  Each file contains 10000 such 3073-byte "rows" of images, although there is nothing delimiting the rows. Therefore each file should be exactly 30730000 bytes long.
 16  There is another file, called batches.meta.txt. This is an ASCII file that maps numeric labels in the range 0-9 to meaningful class names. It is merely a list of the 10 class names, one per row. The class name on row i corresponds to numeric label i.
 17  */
  // 3072 bytes per image / 3 channels = 1024 bytes per channel = 32 rows x 32 columns
  // so the pixel bytes of one file, viewed as a 2-D array, have shape (10000, 3072),
  // which is the same data as shape (10000, 3, 32, 32)
 21  
 22  
 23  // book, chapter 7.5 file access:
 24  // fp is a pointer to a FILE, and fopen returns a pointer to a FILE
 25  FILE *fopen(char *name, char *mode);
 26  
 27  // getc returns the next character from a file; it needs the file pointer to tell it which file.
 28  // it returns EOF for end of file or error
 29  int getc(FILE *fp);
 30  
 31  struct cifar10 {
 32      tensor* input;
 33      tensor* label;
 34  };
 35  
 36  
 37  void read_file(tensor*, tensor*, const char *);
 38  
 39  // made them global variables to persist between executions
 40  // of read_file, and to avoid passing them as output of this fn
 41  int byte_idx=0, img_idx=0, tensor_data_idx=0;
 42  
 43  cifar10* get_cifar10(void){
 44  
 45      if (N_SAMPLES > 50000) {
 46          printf("[cifar] N_SAMPLES cannot be greater than 50,000\n");
 47          exit(1);
 48      }
 49  
 50      // reset global variables, in case get_validation_batch
 51      // was called before the current function
 52      byte_idx=0, img_idx=0, tensor_data_idx=0;
 53  
 54      set_backend_cpu();
 55      tensor* input = EmptyTensor(N_SAMPLES, 3, 32, 32);
 56      set_name(input, "input");
 57      tensor* label = EmptyTensor(N_SAMPLES, 1);
 58      set_name(label, "label");
 59  
 60      int num_per_file = 10000;
 61  
 62      // ceil: if divides without remainder no need to read one more file
 63      int num_files = N_SAMPLES / num_per_file + (N_SAMPLES % num_per_file != 0);
 64      // int num_files = ceil(N_SAMPLES, num_per_file);
 65  
 66      // maximum 5 batches
 67      num_files = min(5, num_files);
 68  
 69      for (int file_idx=1; file_idx<=num_files; file_idx++){
 70          // todo: determine automatically
 71          char file_name[46];
 72          snprintf(file_name, sizeof(char) * 46, "../data/cifar-10-batches-bin/data_batch_%i.bin", file_idx);
 73  
 74          read_file(input, label, file_name);
 75      }
 76  
 77      // pack into a single structure to be returned by the function
 78      cifar10* dataset = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10)));
 79      dataset->input = input;
 80      dataset->label = label;
 81  
 82      sprint(dataset->input);
 83      sprint(dataset->label);
 84  
 85      set_backend_device();
 86      // note: the dataset is stored on the host, each batch is separately copied to device
 87      return dataset;
 88  }
 89  
 90  void read_file(tensor* input, tensor* label, const char* file_name){
 91  
 92      printf("file_name = %s\n", file_name);
 93  
 94      FILE *fp;
 95      // casting to const char bc mode ("rb") is const char -- and
 96      // fopen doesn't have overload for args: char, const char
 97      if (!(fp = fopen((file_name), "rb"))) {
 98          printf("[cifar] Error: can't access file\n");
 99          exit(1);
100      }
101  
102      // - byte_idx is for indexing the input buffer
103      // - tensor_data_idx is for indexing tensor (IOW output buffer)
104      //      - not the same as byte_idx -- byte_idx is larger bc it was also advanced for each label byte (associated with each img)
105      //      - "data->data[byte_idx] = c/255." is incorrect, as it grows larger than "10000*3*32*32", bc it contains counts for labels
106      //        as well (which my tensor* data has no space for), so need separate idxs for byte_idx and img_idx
107  
108      int c;
109      while ((c = getc(fp)) != EOF){
110  
111          if (img_idx > N_SAMPLES){
112              printf("[cifar] Reached N_SAMPLES, stopping.");
113              break;
114          }
115  
116          // iow: (3*32*32 + 1)
117          if (byte_idx % 3073 == 0){
118              label->data[img_idx] = c;
119              img_idx++;
120          } else {
121              if (tensor_data_idx>=input->size){
122                  printf("\n[cifar10] ERR!");
123                  exit(1);
124              }
125  
126              // note: chose the first normalization below bc
127              // empirically it led to lower loss values
128  
129              // standardizing to range: -0.5:0.5:
130              //   originally c is in range 0:255 -- transform into
131              //   range 0:1, and then shift into range -0.5:0.5
132              float value = (c / 255.) - 0.5;
133              // printf("%f\n", value);
134  
135              // // standardizing to mean 0 and a std 1:
136              // float mean[] = {125.306, 122.950, 113.865};
137              // float std[] = {62.993, 62.088, 66.704};
138              // // which of the 3 channels this value belongs to
139              // int idx = (tensor_data_idx % 3072) / 1024;
140              // float value = (c - mean[idx]) / std[idx];
141  
142              input->data[tensor_data_idx] = value;
143              tensor_data_idx++;
144          }
145          byte_idx++;
146      }
147      // breaks the connection between the file pointer and the external name that was established by f open, freeing the file pointer for another file
148      // also, it flushes the buffer in which p u t c is collecting output
149      fclose(fp);
150  
151      printf("\n[cifar] byte_idx: %i\n", byte_idx); // 30730000
152      printf("\n[cifar] img_idx: %i\n", img_idx); // 10000
153  }
154  
155  
156  cifar10* sample_batch(cifar10* dataset, int batch_size, bool is_random){
157      if (batch_size > N_SAMPLES){
158          printf("[get_batch] error: saw batch_size larger than num samples in the dataset\n");
159          exit(1);
160      }
161  
162      set_backend_cpu();
163      tensor* x = EmptyTensor(batch_size, 3, 32, 32);
164      set_name(x, "x");
165      tensor* y = EmptyTensor(batch_size, 1);
166      set_name(y, "y");
167  
168      for (int i=0; i<batch_size; i++){
169          float* curr_x = x->data + i*x->stride[0];
170          float* curr_y = y->data + i*y->stride[0];
171  
172          int idx;
173          if (is_random){
174              idx = (int)rand() % N_SAMPLES;
175          } else {
176              idx = i;
177          }
178  
179          // select x, y at idx form the dataset
180          float* sampled_x = dataset->input->data + idx * dataset->input->stride[0];
181          int size_of_x = dataset->input->size / N_SAMPLES;
182          memcpy(curr_x, sampled_x, size_of_x * sizeof(float));
183  
184          float* sampled_y = dataset->label->data + idx * dataset->label->stride[0];
185          int size_of_y = dataset->label->size / N_SAMPLES;
186          memcpy(curr_y, sampled_y, size_of_y * sizeof(float));
187      }
188  
189      cifar10* batch = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10)));
190      batch->input = x;
191      batch->label = y;
192  
193      set_backend_device();
194      COPY_TO_DEVICE(batch->input);
195      COPY_TO_DEVICE(batch->label);
196      return batch;
197  }
198  
199  
200  cifar10* get_validation_cifar10(void){
201  
202      set_backend_cpu();
203      tensor* input = EmptyTensor(10000, 3, 32, 32);
204      set_name(input, "val_input");
205      tensor* label = EmptyTensor(10000, 1);
206      set_name(label, "val_label");
207  
208      // reset global variables, they were incremented by
209      // "get_cifar10" fn, assuming it ran before this fn
210      byte_idx=0, img_idx=0, tensor_data_idx=0;
211      read_file(input, label, "../data/cifar-10-batches-bin/test_batch.bin");
212  
213      cifar10* batch = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10)));
214      batch->input = input;
215      batch->label = label;
216  
217      set_backend_device();
218      COPY_TO_DEVICE(batch->input);
219      COPY_TO_DEVICE(batch->label);
220      return batch;
221  }