Cradicle Explorer

/ components / kendryte_sdk / src / sipeed_conv.c
sipeed_conv.c
  1  #include "sipeed_conv.h"
  2  #define _P(...) //mp_printf(&mp_plat_print, __VA_ARGS__)
  3  //激活函数折点表，设置为y=x，即直接输出卷积结果
  4  //y=(uint8_t)((((uint64_t)(x - x_start) * y_mul) >> shift) + bias);
  5   
  6  kpu_activate_table_t active_addr __attribute__((aligned(256))) = {
  7   .activate_para = {  //x =36bit
  8    {.data = {.shift_number=0, .y_mul=0, .x_start=0x800000000 }},
  9    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 10    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 11    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 12    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 13    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 14    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 15    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 16    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 17    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 18    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 19    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 20    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 21    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 22    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }},
 23    {.data = {.shift_number=0, .y_mul=1, .x_start=0 }}
 24   },
 25  .activate_para_bias0.data = {
 26    .result_bias = {0,0,0,0,0,0,0,0}
 27   },
 28   .activate_para_bias1.data = {
 29    .result_bias = {0,0,0,0,0,0,0,0}
 30   }
 31  };
 32  
 33  //y = (x*norm_mul)>>norm_shift + norm_add
 34  kpu_batchnorm_argument_t bwsx_base_addr[] __attribute__((aligned(128))) = {
 35   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 36   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 37   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 38   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 39   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 40   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 41   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 42   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 43   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 44   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 45   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 46   {.batchnorm.data = {.norm_mul = 1, .norm_add = 0x0, .norm_shift = 0}},
 47  };
 48  
 49  
 50  //卷积参数
 51  kpu_layer_argument_t la __attribute__((aligned(128)));
 52  //max for 3in*3out, you can modify it
 53  uint16_t conv_data_u16[9*3*3] __attribute__((aligned(128)));
 54  
 55  //池化类型，0表示跳过
 56  //0x1 代表步长为 2 的 2x2 max pooling,
 57  //0x2 代表步长为 2 的 2x2 mean pooling,
 58  //0x3 代表步长为 4 的 4x4 max pooling,
 59  //0x4 代表步长为 4 的 4x4 mean pooling,
 60  //0x5 代表步长为 2 的 2x2 left_top pooling,
 61  //0x6 代表步长为 2 的 2x2 right_bottom pooling,
 62  //0x7 代表步长为 4 的 4x4 left_top pooling,
 63  //0x8 代表步长为 1 的 2x2 mean pooling,
 64  //0x9 代表步长为 1 的 2x2 max pooling
 65  #define AI_MEM_SIZE 0x200000
 66  
 67  static float min(float* data, uint32_t len)
 68  {
 69  	int i;
 70  	float m=data[0];
 71  	for(i=0;i<len;i++)
 72  	{
 73  		if(data[i]<m) m = data[i];
 74  	}
 75  	return m;
 76  }
 77  
 78  static float max(float* data, uint32_t len)
 79  {
 80  	int i;
 81  	float m=data[0];
 82  	for(i=0;i<len;i++)
 83  	{
 84  		if(data[i]>m) m = data[i];
 85  	}
 86  	return m;
 87  }
 88  
 89  //global var: la, active_addr, bwsx_base_addr
 90  static void conv_float2u16(float* data, uint16_t* data_u16, int len)
 91  {
 92  	float dmin, drange,arg_x;
 93  	volatile float scale;
 94  	uint16_t y_mul;
 95  	int i, shift_number;
 96  	dmin=min(data,len);
 97  	drange=max(data,len)-dmin;
 98  	scale = (65535.0/drange);
 99  
100  	//scale conv
101  	_P("convert conv parm: -------------\r\n");
102  	for(i=0;i<len;i++)
103  	{
104  		//float tmp =(float)((double)(data[i]-dmin)*scale);
105  		data_u16[i]=(uint16_t)((data[i]-dmin)*scale);
106  		_P("0x%04x\t",data_u16[i]);
107  		if(i%9==8) {_P("\r\n");}
108  	}
109  	//set arg_x & shr_x
110  	_P("set arg_x & shr_x: -------------\r\n");
111  	arg_x=scale*(dmin>=0?dmin:-dmin);
112  	for(i=0;(arg_x<(float)(0x400000)) && (arg_x!=0);i++)
113  	{
114  		arg_x*=2;
115  		//_P("argx=%f, shrx=%d\r\n", arg_x, i);
116  	}
117  	la.conv_value.data.arg_x = dmin>=0 ? (uint32_t)(arg_x) : (uint32_t)(0x1000000-(uint32_t)arg_x);
118  	la.conv_value.data.shr_x = i;
119  	_P("arg_x=0x%x, shr_x=%d\r\n",la.conv_value.data.arg_x, la.conv_value.data.shr_x);
120  	//set act table
121  	_P("set act table: -------------\r\n");
122  	_P("origin scale=%f\r\n",scale);
123  	scale=1.0/scale;
124  	for(i=0;scale<=16383.0;i++)
125  	{
126  		scale=scale*2;
127  	}
128  	shift_number=i;
129  	y_mul=(uint16_t)(scale);
130  	_P("shift_number=%d, y_mul=%d\r\n", shift_number, y_mul);
131  	for(i=1;i<16;i++)
132  	{
133  		active_addr.activate_para[i].data.shift_number=shift_number;
134  		active_addr.activate_para[i].data.y_mul=y_mul;
135  		active_addr.activate_para[i].data.x_start=0;
136  	}
137  	return;
138  }
139  
140  void sipeed_conv_init(kpu_task_t* task, uint16_t w, uint16_t h, uint8_t ch_in, uint8_t ch_out, float* conv_data) 
141  {
142  	conv_float2u16(conv_data, conv_data_u16, 9*ch_in*ch_out);	//3x3 kernel
143  	la.kernel_offset.data.coef_row_offset = 0;					//固定为0
144  	la.kernel_offset.data.coef_column_offset = 0;				//固定为0
145  	//激活函数配置-
146  	la.kernel_calc_type_cfg.data.load_act=1;					//使能激活函数
147  	la.kernel_calc_type_cfg.data.active_addr = (uint64_t)&active_addr;
148  																//初始化激活表
149  	//row_switch_addr = math.ceil(i_row_wid / 64)
150  	//channel_switch_addr = i_col_high * row_switch_addr	
151  	la.kernel_calc_type_cfg.data.row_switch_addr = (w+63)/64;	//图像宽度占用的单元数
152  	la.kernel_calc_type_cfg.data.channel_switch_addr = (w+63)/64*h; 	
153  	la.kernel_calc_type_cfg.data.coef_size = 0; 				//固定为0
154  	la.kernel_calc_type_cfg.data.coef_group = 1; 		
155  	//中断设置--
156  	la.interrupt_enabe.data.depth_wise_layer = 0; 				//常规卷积层
157  	la.interrupt_enabe.data.int_en = 1;							//使能中断
158  	la.interrupt_enabe.data.full_add = 0; 						//??
159  	la.interrupt_enabe.data.ram_flag = 1;						//??
160  	//dma设置，知道是输出数据使用的DMA--
161  	la.dma_parameter.data.dma_total_byte = w*h*ch_out-1;		//总共的DMA传输数量	
162  	la.dma_parameter.data.send_data_out = 1;					//使能数据的dma输出
163  	la.dma_parameter.data.channel_byte_num = w*h-1;				//单通道的DMA传输数量
164  	//卷积运算参数设置--
165  	// arg_x 为24bit,shr_x 为4bit, 在conv_float2u16中设置
166  	/*	
167  	la.conv_value.data.arg_x = 0;
168  	la.conv_value.data.shr_x = 0;			
169  	la.conv_value.data.arg_w = 0;
170  	la.conv_value.data.shr_w = 0;
171  	la.conv_value2.data.arg_add = 0;
172  	*/
173  	//写回设置--
174  	la.write_back_cfg.data.wb_row_switch_addr = (w+63)/64; 		//ceil(16/64)=1
175  	la.write_back_cfg.data.wb_channel_switch_addr = (w+63)/64*h;			//16*1
176  	la.write_back_cfg.data.wb_group = 1;	//64/w
177  	//图像尺寸设置--
178  	la.image_size.data.i_row_wid = w-1;							//输入长宽
179  	la.image_size.data.i_col_high = h-1;
180  	la.image_size.data.o_row_wid = w-1;							//输出长宽
181  	la.image_size.data.o_col_high = h-1;
182  	//池化类型设置-
183  	la.kernel_pool_type_cfg.data.bypass_conv = 0;				//不略过卷积
184  	la.kernel_pool_type_cfg.data.pad_value = 0x0;				//边界填充0
185  	la.kernel_pool_type_cfg.data.load_para = 1;					//允许归一化
186  	la.kernel_pool_type_cfg.data.pad_type = 0;					//使用填充值
187  	la.kernel_pool_type_cfg.data.kernel_type = 1;				//3x3
188  	la.kernel_pool_type_cfg.data.pool_type = 0;					//池化类型，跳过
189  	la.kernel_pool_type_cfg.data.dma_burst_size = 15;			//dma突发传送大小，16字节
190  	la.kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)&bwsx_base_addr;	
191  																//批归一化首地址
192  	la.kernel_pool_type_cfg.data.first_stride = h<256?0:1;		//图像高度未超过255
193  	//图像通道设置--
194  	la.image_channel_num.data.o_ch_num_coef = ch_out-1;			//一次性参数加载可计算的通道数
195  	la.image_channel_num.data.i_ch_num = ch_in-1;					//输入通道
196  	la.image_channel_num.data.o_ch_num = ch_out-1;				//输出通道
197  	//卷积参数设置-
198  	la.kernel_load_cfg.data.load_time = 0;						//卷积加载次数，不超过72KB，只加载一次
199  	la.kernel_load_cfg.data.para_size = 2*9*ch_in*ch_out;		//卷积参数大小
200  	la.kernel_load_cfg.data.para_start_addr = (uint64_t)conv_data_u16;
201  																//起始地址
202  	la.kernel_load_cfg.data.load_coor = 1;						//允许加载卷积参数
203  	//计算地址设置--
204  	la.image_addr.data.image_src_addr=(uint64_t)0x0;			//一个为0
205  	la.image_addr.data.image_dst_addr=(uint64_t)(AI_MEM_SIZE/64-(w+63)/64*h*ch_out);	
206  
207  	/* init kpu task*/
208  	task->layers = &la;
209  	task->layers_length = 1;    								//单层
210  	task->eight_bit_mode = 0;   								//16bit模式
211  	task->output_scale = 1.0;   								//输出的缩放
212  	task->output_bias = 0;										//输出的偏置
213  }
214  
215  void sipeed_conv_run(kpu_task_t* task, uint8_t* img_src, uint8_t* img_dst, plic_irq_callback_t callback)
216  {
217  	/* start to calculate */
218  	kpu_run(task, DMAC_CHANNEL5, img_src, img_dst, callback);
219  }
220