// cifar10.cpp
1 #include <iostream> // todo: use C only 2 #include <stdio.h> // structure declaration called FILE 3 #include <string.h> // memcopy 4 5 6 7 /* 8 The binary version contains the files data_batch_1.bin, data_batch_2.bin, ..., data_batch_5.bin, as well as test_batch.bin. Each of these files is formatted as follows: 9 <1 x label><3072 x pixel> 10 ... 11 <1 x label><3072 x pixel> 12 In other words, the first byte is the label of the first image, which is a number in the range 0-9. The next 3072 bytes are the values of the pixels of the image. 13 The first 1024 bytes are the red channel values, the next 1024 the green, and the final 1024 the blue. 14 The values are stored in row-major order, so the first 32 bytes are the red channel values of the first row of the image. 15 Each file contains 10000 such 3073-byte "rows" of images, although there is nothing delimiting the rows. Therefore each file should be exactly 30730000 bytes long. 16 There is another file, called batches.meta.txt. This is an ASCII file that maps numeric labels in the range 0-9 to meaningful class names. It is merely a list of the 10 class names, one per row. The class name on row i corresponds to numeric label i. 17 */ 18 // 3072 (bits per img) / 3(channels) / 8 (bits in 1 byte) / 4 (bytes in 1 float) = 32 19 // 10000, 3072 20 // 10000, 3, 32, 32 21 22 23 // book, chapter 7.5 file access: 24 // fp is a pointer to a FILE, and fopen returns a pointer to a FILE 25 FILE *fopen(char *name, char *mode); 26 27 // getc returns the next character from a file; it needs the file pointer to tell it which file. 
28 // it returns EOF for end of file or error 29 int getc(FILE *fp); 30 31 struct cifar10 { 32 tensor* input; 33 tensor* label; 34 }; 35 36 37 void read_file(tensor*, tensor*, const char *); 38 39 // made them global variables to persist between executions 40 // of read_file, and to avoid passing them as output of this fn 41 int byte_idx=0, img_idx=0, tensor_data_idx=0; 42 43 cifar10* get_cifar10(void){ 44 45 if (N_SAMPLES > 50000) { 46 printf("[cifar] N_SAMPLES cannot be greater than 50,000\n"); 47 exit(1); 48 } 49 50 // reset global variables, in case get_validation_batch 51 // was called before the current function 52 byte_idx=0, img_idx=0, tensor_data_idx=0; 53 54 set_backend_cpu(); 55 tensor* input = EmptyTensor(N_SAMPLES, 3, 32, 32); 56 set_name(input, "input"); 57 tensor* label = EmptyTensor(N_SAMPLES, 1); 58 set_name(label, "label"); 59 60 int num_per_file = 10000; 61 62 // ceil: if divides without remainder no need to read one more file 63 int num_files = N_SAMPLES / num_per_file + (N_SAMPLES % num_per_file != 0); 64 // int num_files = ceil(N_SAMPLES, num_per_file); 65 66 // maximum 5 batches 67 num_files = min(5, num_files); 68 69 for (int file_idx=1; file_idx<=num_files; file_idx++){ 70 // todo: determine automatically 71 char file_name[46]; 72 snprintf(file_name, sizeof(char) * 46, "../data/cifar-10-batches-bin/data_batch_%i.bin", file_idx); 73 74 read_file(input, label, file_name); 75 } 76 77 // pack into a single structure to be returned by the function 78 cifar10* dataset = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10))); 79 dataset->input = input; 80 dataset->label = label; 81 82 sprint(dataset->input); 83 sprint(dataset->label); 84 85 set_backend_device(); 86 // note: the dataset is stored on the host, each batch is separately copied to device 87 return dataset; 88 } 89 90 void read_file(tensor* input, tensor* label, const char* file_name){ 91 92 printf("file_name = %s\n", file_name); 93 94 FILE *fp; 95 // casting to const char bc mode ("rb") 
is const char -- and 96 // fopen doesn't have overload for args: char, const char 97 if (!(fp = fopen((file_name), "rb"))) { 98 printf("[cifar] Error: can't access file\n"); 99 exit(1); 100 } 101 102 // - byte_idx is for indexing the input buffer 103 // - tensor_data_idx is for indexing tensor (IOW output buffer) 104 // - not the same as byte_idx -- byte_idx is larger bc it was also advanced for each label byte (associated with each img) 105 // - "data->data[byte_idx] = c/255." is incorrect, as it grows larger than "10000*3*32*32", bc it contains counts for labels 106 // as well (which my tensor* data has no space for), so need separate idxs for byte_idx and img_idx 107 108 int c; 109 while ((c = getc(fp)) != EOF){ 110 111 if (img_idx > N_SAMPLES){ 112 printf("[cifar] Reached N_SAMPLES, stopping."); 113 break; 114 } 115 116 // iow: (3*32*32 + 1) 117 if (byte_idx % 3073 == 0){ 118 label->data[img_idx] = c; 119 img_idx++; 120 } else { 121 if (tensor_data_idx>=input->size){ 122 printf("\n[cifar10] ERR!"); 123 exit(1); 124 } 125 126 // note: chose the first normalization below bc 127 // empirically it led to lower loss values 128 129 // standardizing to range: -0.5:0.5: 130 // originally c is in range 0:255 -- transform into 131 // range 0:1, and then shift into range -0.5:0.5 132 float value = (c / 255.) 
- 0.5; 133 // printf("%f\n", value); 134 135 // // standardizing to mean 0 and a std 1: 136 // float mean[] = {125.306, 122.950, 113.865}; 137 // float std[] = {62.993, 62.088, 66.704}; 138 // // which of the 3 channels this value belongs to 139 // int idx = (tensor_data_idx % 3072) / 1024; 140 // float value = (c - mean[idx]) / std[idx]; 141 142 input->data[tensor_data_idx] = value; 143 tensor_data_idx++; 144 } 145 byte_idx++; 146 } 147 // breaks the connection between the file pointer and the external name that was established by f open, freeing the file pointer for another file 148 // also, it flushes the buffer in which p u t c is collecting output 149 fclose(fp); 150 151 printf("\n[cifar] byte_idx: %i\n", byte_idx); // 30730000 152 printf("\n[cifar] img_idx: %i\n", img_idx); // 10000 153 } 154 155 156 cifar10* sample_batch(cifar10* dataset, int batch_size, bool is_random){ 157 if (batch_size > N_SAMPLES){ 158 printf("[get_batch] error: saw batch_size larger than num samples in the dataset\n"); 159 exit(1); 160 } 161 162 set_backend_cpu(); 163 tensor* x = EmptyTensor(batch_size, 3, 32, 32); 164 set_name(x, "x"); 165 tensor* y = EmptyTensor(batch_size, 1); 166 set_name(y, "y"); 167 168 for (int i=0; i<batch_size; i++){ 169 float* curr_x = x->data + i*x->stride[0]; 170 float* curr_y = y->data + i*y->stride[0]; 171 172 int idx; 173 if (is_random){ 174 idx = (int)rand() % N_SAMPLES; 175 } else { 176 idx = i; 177 } 178 179 // select x, y at idx form the dataset 180 float* sampled_x = dataset->input->data + idx * dataset->input->stride[0]; 181 int size_of_x = dataset->input->size / N_SAMPLES; 182 memcpy(curr_x, sampled_x, size_of_x * sizeof(float)); 183 184 float* sampled_y = dataset->label->data + idx * dataset->label->stride[0]; 185 int size_of_y = dataset->label->size / N_SAMPLES; 186 memcpy(curr_y, sampled_y, size_of_y * sizeof(float)); 187 } 188 189 cifar10* batch = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10))); 190 batch->input = x; 191 batch->label = y; 
192 193 set_backend_device(); 194 COPY_TO_DEVICE(batch->input); 195 COPY_TO_DEVICE(batch->label); 196 return batch; 197 } 198 199 200 cifar10* get_validation_cifar10(void){ 201 202 set_backend_cpu(); 203 tensor* input = EmptyTensor(10000, 3, 32, 32); 204 set_name(input, "val_input"); 205 tensor* label = EmptyTensor(10000, 1); 206 set_name(label, "val_label"); 207 208 // reset global variables, they were incremented by 209 // "get_cifar10" fn, assuming it ran before this fn 210 byte_idx=0, img_idx=0, tensor_data_idx=0; 211 read_file(input, label, "../data/cifar-10-batches-bin/test_batch.bin"); 212 213 cifar10* batch = (cifar10*)checkMallocErrors(malloc(sizeof(cifar10))); 214 batch->input = input; 215 batch->label = label; 216 217 set_backend_device(); 218 COPY_TO_DEVICE(batch->input); 219 COPY_TO_DEVICE(batch->label); 220 return batch; 221 }