sipeed_conv.c
1 #include "sipeed_conv.h" 2 #define _P(...) //mp_printf(&mp_plat_print, __VA_ARGS__) 3 //激活函数折点表,设置为y=x,即直接输出卷积结果 4 //y=(uint8_t)((((uint64_t)(x - x_start) * y_mul) >> shift) + bias); 5 6 kpu_activate_table_t active_addr __attribute__((aligned(256))) = { 7 .activate_para = { //x =36bit 8 {.data = {.shift_number=0, .y_mul=0, .x_start=0x800000000 }}, 9 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 10 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 11 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 12 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 13 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 14 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 15 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 16 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 17 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 18 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 19 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 20 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 21 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 22 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}, 23 {.data = {.shift_number=0, .y_mul=1, .x_start=0 }} 24 }, 25 .activate_para_bias0.data = { 26 .result_bias = {0,0,0,0,0,0,0,0} 27 }, 28 .activate_para_bias1.data = { 29 .result_bias = {0,0,0,0,0,0,0,0} 30 } 31 }; 32 33 //y = (x*norm_mul)>>norm_shift + norm_add 34 kpu_batchnorm_argument_t bwsx_base_addr[] __attribute__((aligned(128))) = { 35 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 36 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 37 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 38 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 39 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 40 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 41 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 42 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 43 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 44 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 45 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 46 {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}}, 47 }; 48 49 50 //卷积参数 51 kpu_layer_argument_t la __attribute__((aligned(128))); 52 //max for 3in*3out, you can modify it 53 uint16_t conv_data_u16[9*3*3] __attribute__((aligned(128))); 54 55 //池化类型,0表示跳过 56 //0x1 代表步长为 2 的 2x2 max pooling, 57 //0x2 代表步长为 2 的 2x2 mean pooling, 58 //0x3 代表步长为 4 的 4x4 max pooling, 59 //0x4 代表步长为 4 的 4x4 mean pooling, 60 //0x5 代表步长为 2 的 2x2 left_top pooling, 61 //0x6 代表步长为 2 的 2x2 right_bottom pooling, 62 //0x7 代表步长为 4 的 4x4 left_top pooling, 63 //0x8 代表步长为 1 的 2x2 mean pooling, 64 //0x9 代表步长为 1 的 2x2 max pooling 65 #define AI_MEM_SIZE 0x200000 66 67 static float min(float* data, uint32_t len) 68 { 69 int i; 70 float m=data[0]; 71 for(i=0;i<len;i++) 72 { 73 if(data[i]<m) m = data[i]; 74 } 75 return m; 76 } 77 78 static float max(float* data, uint32_t len) 79 { 80 int i; 81 float m=data[0]; 82 for(i=0;i<len;i++) 83 { 84 if(data[i]>m) m = data[i]; 85 } 86 return m; 87 } 88 89 //global var: la, active_addr, bwsx_base_addr 90 static void conv_float2u16(float* data, uint16_t* data_u16, int len) 91 { 92 float dmin, drange,arg_x; 93 volatile float scale; 94 uint16_t y_mul; 95 int i, shift_number; 96 dmin=min(data,len); 97 drange=max(data,len)-dmin; 98 scale = (65535.0/drange); 99 100 //scale conv 101 _P("convert conv parm: -------------\r\n"); 102 for(i=0;i<len;i++) 103 { 104 //float tmp =(float)((double)(data[i]-dmin)*scale); 105 data_u16[i]=(uint16_t)((data[i]-dmin)*scale); 106 _P("0x%04x\t",data_u16[i]); 107 if(i%9==8) {_P("\r\n");} 108 } 109 //set arg_x & shr_x 110 _P("set arg_x & shr_x: -------------\r\n"); 111 arg_x=scale*(dmin>=0?dmin:-dmin); 112 for(i=0;(arg_x<(float)(0x400000)) && (arg_x!=0);i++) 113 { 114 arg_x*=2; 115 //_P("argx=%f, shrx=%d\r\n", arg_x, i); 116 } 117 la.conv_value.data.arg_x = dmin>=0 ? (uint32_t)(arg_x) : (uint32_t)(0x1000000-(uint32_t)arg_x); 118 la.conv_value.data.shr_x = i; 119 _P("arg_x=0x%x, shr_x=%d\r\n",la.conv_value.data.arg_x, la.conv_value.data.shr_x); 120 //set act table 121 _P("set act table: -------------\r\n"); 122 _P("origin scale=%f\r\n",scale); 123 scale=1.0/scale; 124 for(i=0;scale<=16383.0;i++) 125 { 126 scale=scale*2; 127 } 128 shift_number=i; 129 y_mul=(uint16_t)(scale); 130 _P("shift_number=%d, y_mul=%d\r\n", shift_number, y_mul); 131 for(i=1;i<16;i++) 132 { 133 active_addr.activate_para[i].data.shift_number=shift_number; 134 active_addr.activate_para[i].data.y_mul=y_mul; 135 active_addr.activate_para[i].data.x_start=0; 136 } 137 return; 138 } 139 140 void sipeed_conv_init(kpu_task_t* task, uint16_t w, uint16_t h, uint8_t ch_in, uint8_t ch_out, float* conv_data) 141 { 142 conv_float2u16(conv_data, conv_data_u16, 9*ch_in*ch_out); //3x3 kernel 143 la.kernel_offset.data.coef_row_offset = 0; //固定为0 144 la.kernel_offset.data.coef_column_offset = 0; //固定为0 145 //激活函数配置- 146 la.kernel_calc_type_cfg.data.load_act=1; //使能激活函数 147 la.kernel_calc_type_cfg.data.active_addr = (uint64_t)&active_addr; 148 //初始化激活表 149 //row_switch_addr = math.ceil(i_row_wid / 64) 150 //channel_switch_addr = i_col_high * row_switch_addr 151 la.kernel_calc_type_cfg.data.row_switch_addr = (w+63)/64; //图像宽度占用的单元数 152 la.kernel_calc_type_cfg.data.channel_switch_addr = (w+63)/64*h; 153 la.kernel_calc_type_cfg.data.coef_size = 0; //固定为0 154 la.kernel_calc_type_cfg.data.coef_group = 1; 155 //中断设置-- 156 la.interrupt_enabe.data.depth_wise_layer = 0; //常规卷积层 157 la.interrupt_enabe.data.int_en = 1; //使能中断 158 la.interrupt_enabe.data.full_add = 0; //?? 159 la.interrupt_enabe.data.ram_flag = 1; //?? 160 //dma设置,知道是输出数据使用的DMA-- 161 la.dma_parameter.data.dma_total_byte = w*h*ch_out-1; //总共的DMA传输数量 162 la.dma_parameter.data.send_data_out = 1; //使能数据的dma输出 163 la.dma_parameter.data.channel_byte_num = w*h-1; //单通道的DMA传输数量 164 //卷积运算参数设置-- 165 // arg_x 为24bit,shr_x 为4bit, 在conv_float2u16中设置 166 /* 167 la.conv_value.data.arg_x = 0; 168 la.conv_value.data.shr_x = 0; 169 la.conv_value.data.arg_w = 0; 170 la.conv_value.data.shr_w = 0; 171 la.conv_value2.data.arg_add = 0; 172 */ 173 //写回设置-- 174 la.write_back_cfg.data.wb_row_switch_addr = (w+63)/64; //ceil(16/64)=1 175 la.write_back_cfg.data.wb_channel_switch_addr = (w+63)/64*h; //16*1 176 la.write_back_cfg.data.wb_group = 1; //64/w 177 //图像尺寸设置-- 178 la.image_size.data.i_row_wid = w-1; //输入长宽 179 la.image_size.data.i_col_high = h-1; 180 la.image_size.data.o_row_wid = w-1; //输出长宽 181 la.image_size.data.o_col_high = h-1; 182 //池化类型设置- 183 la.kernel_pool_type_cfg.data.bypass_conv = 0; //不略过卷积 184 la.kernel_pool_type_cfg.data.pad_value = 0x0; //边界填充0 185 la.kernel_pool_type_cfg.data.load_para = 1; //允许归一化 186 la.kernel_pool_type_cfg.data.pad_type = 0; //使用填充值 187 la.kernel_pool_type_cfg.data.kernel_type = 1; //3x3 188 la.kernel_pool_type_cfg.data.pool_type = 0; //池化类型,跳过 189 la.kernel_pool_type_cfg.data.dma_burst_size = 15; //dma突发传送大小,16字节 190 la.kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)&bwsx_base_addr; 191 //批归一化首地址 192 la.kernel_pool_type_cfg.data.first_stride = h<256?0:1; //图像高度未超过255 193 //图像通道设置-- 194 la.image_channel_num.data.o_ch_num_coef = ch_out-1; //一次性参数加载可计算的通道数 195 la.image_channel_num.data.i_ch_num = ch_in-1; //输入通道 196 la.image_channel_num.data.o_ch_num = ch_out-1; //输出通道 197 //卷积参数设置- 198 la.kernel_load_cfg.data.load_time = 0; //卷积加载次数,不超过72KB,只加载一次 199 la.kernel_load_cfg.data.para_size = 2*9*ch_in*ch_out; //卷积参数大小 200 la.kernel_load_cfg.data.para_start_addr = (uint64_t)conv_data_u16; 201 //起始地址 202 la.kernel_load_cfg.data.load_coor = 1; //允许加载卷积参数 203 //计算地址设置-- 204 la.image_addr.data.image_src_addr=(uint64_t)0x0; //一个为0 205 la.image_addr.data.image_dst_addr=(uint64_t)(AI_MEM_SIZE/64-(w+63)/64*h*ch_out); 206 207 /* init kpu task*/ 208 task->layers = &la; 209 task->layers_length = 1; //单层 210 task->eight_bit_mode = 0; //16bit模式 211 task->output_scale = 1.0; //输出的缩放 212 task->output_bias = 0; //输出的偏置 213 } 214 215 void sipeed_conv_run(kpu_task_t* task, uint8_t* img_src, uint8_t* img_dst, plic_irq_callback_t callback) 216 { 217 /* start to calculate */ 218 kpu_run(task, DMAC_CHANNEL5, img_src, img_dst, callback); 219 } 220