- Timestamp:
- Sep 14, 2020, 6:36:03 PM (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c
r453 r458 17 17 #include "utils.h" 18 18 #include "kpu_main.h" 19 #include "kernel_cfg.h" 19 20 20 21 #define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b)) 21 22 22 void sysctl_enable_irq(void)23 {24 set_csr(mie, MIP_MEIP);25 set_csr(mstatus, MSTATUS_MIE);26 }27 28 void sysctl_disable_irq(void)29 {30 clear_csr(mie, MIP_MEIP);31 clear_csr(mstatus, MSTATUS_MIE);32 }33 34 23 uint64_t sysctl_get_time_us(void) 35 24 { 36 37 25 uint64_t v_cycle = read_cycle(); 26 return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0; 38 27 } 39 28 40 29 static int is_memory(uintptr_t address) 41 30 { 42 43 44 45 46 47 31 enum 32 { 33 mem_len = 6 * 1024 * 1024, 34 mem_no_cache_len = 8 * 1024 * 1024, 35 }; 36 return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040); 48 37 } 49 38 50 39 uint32_t is_memory_cache(uintptr_t address) 51 40 { 52 53 54 41 #define MEM_CACHE_LEN (6 * 1024 * 1024) 42 43 return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN)); 55 44 } 56 45 57 46 int plic_irq_enable(INTNO irq_number) 58 47 { 59 60 61 62 48 if (irq_number != INTNO_AI) 49 return -1; 50 ena_int(irq_number); 51 return 0; 63 52 } 64 53 65 54 int plic_set_priority(INTNO irq_number, uint32_t priority) 66 55 { 67 68 69 70 56 if (irq_number != INTNO_AI) 57 return -1; 58 set_ipriority(irq_number, priority); 59 return 0; 71 60 } 72 61 … … 76 65 void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx) 77 66 { 78 ER ret; 79 if (irq != INTNO_AI) 80 return; 81 82 ret = loc_cpu(); 83 84 ai_done_callback = callback; 85 ai_done_ctx = ctx; 86 87 if (ret == E_OK) 88 unl_cpu(); 67 if (irq != INTNO_AI) 68 return; 69 70 dis_int(INTNO_AI); 71 72 ai_done_callback = callback; 73 ai_done_ctx = ctx; 74 75 ena_int(INTNO_AI); 89 76 } 90 77 91 78 void ai_done_isr(intptr_t exinf) 92 79 { 93 sysctl_disable_irq(); 94 if (ai_done_callback != NULL){ 95 ai_done_callback(ai_done_ctx); 96 } 97 sysctl_enable_irq(); 80 dis_int(INTNO_AI); 81 if (ai_done_callback != NULL) 82 { 83 ai_done_callback(ai_done_ctx); 84 } 85 ena_int(INTNO_AI); 98 86 } 99 87 … … 102 90 103 91 void kpu_dmac_irq_register(dmac_channel_number_t channel_num, 104 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority) 105 { 106 ER ret; 107 if (channel_num != AI_DMA_CH) 108 return; 109 110 set_ipriority(INTNO_DMAAI, priority); 111 112 ret = loc_cpu(); 113 114 ai_dma_done_callback = dmac_callback; 115 ai_dma_done_ctx = ctx; 116 117 if (ret == E_OK) 118 unl_cpu(); 92 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority) 93 { 94 if (channel_num != AI_DMA_CH) 95 return; 96 97 //set_ipriority(INTNO_DMAAI, priority); 98 99 dis_int(INTNO_DMAAI); 100 101 ai_dma_done_callback = dmac_callback; 102 ai_dma_done_ctx = ctx; 103 104 ena_int(INTNO_DMAAI); 119 105 } 120 106 121 107 void ai_dma_done_isr(DMA_Handle_t *dma) 122 108 { 123 sysctl_disable_irq(); 124 if (ai_dma_done_callback != NULL) { 125 ai_dma_done_callback(ai_dma_done_ctx); 126 } 127 sysctl_enable_irq(); 109 dis_int(INTNO_DMAAI); 110 111 if (ai_dma_done_callback != NULL) 112 { 113 ai_dma_done_callback(ai_dma_done_ctx); 114 } 115 116 ena_int(INTNO_DMAAI); 128 117 } 129 118 130 119 void dmac_set_irq(dmac_channel_number_t channel_num, 131 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority) 132 { 133 ER ret; 134 if (channel_num != AI_DMA_CH) 135 return; 136 137 set_ipriority(INTNO_DMAAI, priority); 138 139 ret = loc_cpu(); 140 141 ai_dma_done_callback = dmac_callback; 142 ai_dma_done_ctx = ctx; 143 144 if (ret == E_OK) 145 unl_cpu(); 120 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority) 121 { 122 if (channel_num != AI_DMA_CH) 123 return; 124 125 //set_ipriority(INTNO_DMAAI, priority); 126 127 dis_int(INTNO_DMAAI); 128 129 ai_dma_done_callback = dmac_callback; 130 ai_dma_done_ctx = ctx; 131 132 ena_int(INTNO_DMAAI); 146 133 } 147 134 … … 149 136 150 137 void dmac_set_single_mode(dmac_channel_number_t channel_num, 151 152 153 154 155 156 { 157 158 159 160 161 162 163 if(mem_type_src == 0 && mem_type_dest == 0)164 165 else if(mem_type_src == 1 && mem_type_dest == 0)166 167 else if(mem_type_src == 0 && mem_type_dest == 1)168 169 170 171 172 hdma->Init.Direction = flow_control;/* DMA転送方向 */173 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);/* ソースハンドシェイク */174 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);/* デスティネーションハンドシェイク */175 hdma->Init.SrcInc = src_inc;/* ソースインクリメント設定 */176 hdma->Init.DstInc = dest_inc;/* デスティネーションインクリメント設定 */177 hdma->Init.SrcTransWidth = dmac_trans_width;/* ソース転送幅 */178 hdma->Init.DstTransWidth = dmac_trans_width;/* デスティネーション転送幅 */179 hdma->Init.SrcBurstSize = dmac_burst_size;/* ソースバーストサイズ */180 hdma->Init.DstBurstSize = dmac_burst_size;/* デスティネーションバーストサイズ */181 182 183 138 const void *src, void *dest, uint8_t src_inc, 139 uint8_t dest_inc, 140 uint8_t dmac_burst_size, 141 uint8_t dmac_trans_width, 142 size_t block_size) 143 { 144 if (channel_num != AI_DMA_CH) 145 return; 146 147 DMA_Handle_t *hdma = &g_ai_hdma; 148 int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest); 149 uint8_t flow_control; 150 if (mem_type_src == 0 && mem_type_dest == 0) 151 flow_control = DMA_PERIPH_TO_PERIPH; 152 else if (mem_type_src == 1 && mem_type_dest == 0) 153 flow_control = DMA_MEMORY_TO_PERIPH; 154 else if (mem_type_src == 0 && mem_type_dest == 1) 155 flow_control = DMA_PERIPH_TO_MEMORY; 156 else 157 flow_control = DMA_MEMORY_TO_MEMORY; 158 159 hdma->Init.Direction = flow_control; /* DMA転送方向 */ 160 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* ソースハンドシェイク */ 161 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* デスティネーションハンドシェイク */ 162 hdma->Init.SrcInc = src_inc; /* ソースインクリメント設定 */ 163 hdma->Init.DstInc = dest_inc; /* デスティネーションインクリメント設定 */ 164 hdma->Init.SrcTransWidth = dmac_trans_width; /* ソース転送幅 */ 165 hdma->Init.DstTransWidth = dmac_trans_width; /* デスティネーション転送幅 */ 166 hdma->Init.SrcBurstSize = dmac_burst_size; /* ソースバーストサイズ */ 167 hdma->Init.DstBurstSize = dmac_burst_size; /* デスティネーションバーストサイズ */ 168 dma_reset(hdma); 169 170 dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size); 184 171 } 185 172 … … 199 186 static volatile uint32_t kpu_status; 200 187 201 typedef struct kpu_context202 {203 kpu_task_t kpu_task;204 uint32_t kpu_status;205 } kpu_context_t;206 207 volatile kpu_context_t g_kpu_context;208 209 static int kpu_run_all_done(void *_task)210 {211 atomic_swap(&g_kpu_context.kpu_status, 0);212 kpu_task_t *task = (kpu_task_t *)_task;213 task->callback(task);214 return 0;215 }216 217 int kpu_continue(void *_task)218 {219 kpu_task_t *task = (kpu_task_t *)_task;220 int layer_burst_size = 1;221 222 kpu->interrupt_clear.data = (kpu_config_interrupt_t){223 .calc_done_int = 1,224 .layer_cfg_almost_empty_int = 1,225 .layer_cfg_almost_full_int = 1};226 227 if(task->remain_layers_length == 0)228 {229 return 0;230 }231 if(task->remain_layers_length <= layer_burst_size)232 {233 for(uint32_t i = 0; i < task->remain_layers_length; i++)234 {235 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;236 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;237 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;238 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;239 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;240 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;241 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;242 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;243 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;244 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;245 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;246 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;247 }248 task->remain_layers_length = 0;249 } else250 {251 for(uint32_t i = 0; i < layer_burst_size; i++)252 {253 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;254 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;255 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;256 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;257 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;258 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;259 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;260 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;261 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;262 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;263 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;264 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;265 }266 task->remain_layers += layer_burst_size;267 task->remain_layers_length -= layer_burst_size;268 }269 return 0;270 }271 272 static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)273 {274 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);275 kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);276 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,277 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);278 return 0;279 }280 281 static int kpu_run_dma_input_done_push_layers(void *_task)282 {283 kpu_task_t *task = (kpu_task_t *)_task;284 kpu->interrupt_clear.reg = 7;285 dma_end(&g_ai_hdma);286 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){287 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};288 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){289 .eight_bit_mode = task->eight_bit_mode};290 291 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];292 293 kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);294 295 kpu->interrupt_mask.data = (kpu_config_interrupt_t){296 .calc_done_int = 0,297 .layer_cfg_almost_empty_int = 0,298 .layer_cfg_almost_full_int = 1};299 kpu_continue(task);300 return 0;301 }302 303 static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)304 {305 kpu_task_t *task = _task;306 kpu_layer_argument_t *first_layer = &task->layers[0];307 uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);308 kpu_dmac_irq_register(dma_ch, cb, _task, 1);309 dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,310 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);311 }312 313 int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)314 {315 if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))316 return -1;317 318 memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));319 kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;320 321 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];322 323 uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;324 325 last_layer->dma_parameter.data.send_data_out = 1;326 last_layer->interrupt_enabe.data.int_en = 1;327 328 task->dma_ch = dma_ch;329 task->dst = dest;330 task->dst_length = output_size;331 task->callback = callback;332 task->remain_layers_length = task->layers_length;333 task->remain_layers = task->layers;334 335 plic_set_priority(INTNO_AI, 1);336 plic_irq_register(INTNO_AI, kpu_continue, task);337 plic_irq_enable(INTNO_AI);338 339 kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);340 341 return 0;342 }343 344 uint8_t *kpu_get_output_buf(kpu_task_t *task)345 {346 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];347 size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;348 return malloc(output_size);349 }350 351 void kpu_release_output_buf(uint8_t *output_buf)352 {353 if(output_buf != NULL)354 free(output_buf);355 }356 357 static int kpu_done(void *ctx)358 {359 atomic_swap(&kpu_status, 0);360 kpu_task_t *task = (kpu_task_t *)ctx;361 task->callback(task->ctx);362 return 0;363 }364 365 static int kpu_config_input(void *ctx)366 {367 kpu_task_t *task = (kpu_task_t *)ctx;368 kpu->interrupt_clear.reg = 7;369 if(task->remain_layers_length <= LAYER_BURST_SIZE)370 {371 for(uint32_t i = 0; i < task->remain_layers_length; i++)372 {373 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;374 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;375 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;376 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;377 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;378 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;379 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;380 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;381 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;382 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;383 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;384 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;385 }386 task->remain_layers_length = 0;387 kpu->interrupt_mask.reg = 7;388 } else389 {390 for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)391 {392 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;393 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;394 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;395 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;396 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;397 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;398 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;399 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;400 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;401 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;402 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;403 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;404 }405 task->remain_layers += LAYER_BURST_SIZE;406 task->remain_layers_length -= LAYER_BURST_SIZE;407 }408 return 0;409 }410 411 static void kpu_data_output(kpu_task_t *task)412 {413 select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);414 kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);415 dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,416 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);417 }418 419 static int kpu_data_ready(void *ctx)420 {421 kpu_task_t *task = (kpu_task_t *)ctx;422 423 dma_end(&g_ai_hdma);424 kpu_data_output(task);425 426 kpu->eight_bit_mode.reg = task->eight_bit_mode;427 kpu->interrupt_mask.reg = 7;428 kpu->interrupt_clear.reg = 7;429 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){430 .fifo_full_threshold = 12, .fifo_empty_threshold = 1};431 432 plic_set_priority(INTNO_AI, 2);433 plic_irq_register(INTNO_AI, kpu_config_input, task);434 plic_irq_enable(INTNO_AI);435 kpu_config_input(task);436 kpu->interrupt_mask.data = (kpu_config_interrupt_t){437 .calc_done_int = 1,438 .layer_cfg_almost_empty_int = 0,439 .layer_cfg_almost_full_int = 1};440 return 0;441 }442 443 static void kpu_data_input(kpu_task_t *task)444 {445 if(task->src == NULL)446 {447 kpu_data_ready(task);448 return;449 }450 kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);451 kpu_layer_argument_t *layer = &task->layers[0];452 dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,453 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);454 }455 456 int kpu_single_task_init(kpu_task_t *task)457 {458 /*459 * AIクロック有効化460 */461 sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);462 463 kpu_layer_argument_t *first_layer = &task->layers[0];464 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];465 466 last_layer->dma_parameter.data.send_data_out = 1;467 last_layer->interrupt_enabe.data.int_en = 1;468 task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;469 task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;470 task->dst = (uint64_t *)malloc(task->dst_length * 8);471 if(task->dst == NULL)472 return 1;473 memset(task->dst, 0, task->dst_length * 8);474 return 0;475 }476 477 int kpu_single_task_deinit(kpu_task_t *task)478 {479 free(task->dst);480 return 0;481 }482 483 int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)484 {485 uintptr_t base_addr = (uintptr_t)buffer;486 kpu_model_header_t *header = (kpu_model_header_t *)buffer;487 kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));488 kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);489 490 if(header->version != 1)491 return -1;492 uint32_t layers_length = header->layers_length;493 task->layers_length = layers_length;494 task->eight_bit_mode = header->flags & 1;495 task->layers = layers;496 task->output_scale = layer_meta[layers_length - 1].output_scale;497 task->output_bias = layer_meta[layers_length - 1].output_bias;498 size_t i;499 for(i = 0; i < layers_length; i++)500 {501 layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);502 layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);503 layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);504 }505 506 if(meta)507 *meta = layer_meta;508 return 0;509 }510 511 int kpu_start(kpu_task_t *task)512 {513 if(atomic_cas(&kpu_status, 0, 1))514 return -1;515 516 task->remain_layers_length = task->layers_length;517 task->remain_layers = task->layers;518 kpu_data_input(task);519 return 0;520 }521 522 188 static void kpu_send_layer(const kpu_layer_argument_t *layer) 523 189 { 524 kpu->layer_argument_fifo = layer->interrupt_enabe.reg; 525 kpu->layer_argument_fifo = layer->image_addr.reg; 526 kpu->layer_argument_fifo = layer->image_channel_num.reg; 527 kpu->layer_argument_fifo = layer->image_size.reg; 528 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg; 529 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg; 530 kpu->layer_argument_fifo = layer->kernel_offset.reg; 531 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg; 532 kpu->layer_argument_fifo = layer->write_back_cfg.reg; 533 kpu->layer_argument_fifo = layer->conv_value.reg; 534 kpu->layer_argument_fifo = layer->conv_value2.reg; 535 kpu->layer_argument_fifo = layer->dma_parameter.reg; 536 } 537 538 void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata) 539 { 540 kpu->interrupt_clear.reg = 7; 541 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){ 542 .fifo_full_threshold = 10, .fifo_empty_threshold = 1}; 543 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){ 544 .eight_bit_mode = eight_bit_mode}; 545 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 546 .calc_done_int = 1, 547 .layer_cfg_almost_empty_int = 0, 548 .layer_cfg_almost_full_int = 1}; 549 550 plic_set_priority(INTNO_AI, 1); 551 plic_irq_register(INTNO_AI, callback, userdata); 552 plic_irq_enable(INTNO_AI); 190 kpu->layer_argument_fifo = layer->interrupt_enabe.reg; 191 kpu->layer_argument_fifo = layer->image_addr.reg; 192 kpu->layer_argument_fifo = layer->image_channel_num.reg; 193 kpu->layer_argument_fifo = layer->image_size.reg; 194 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg; 195 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg; 196 kpu->layer_argument_fifo = layer->kernel_offset.reg; 197 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg; 198 kpu->layer_argument_fifo = layer->write_back_cfg.reg; 199 kpu->layer_argument_fifo = layer->conv_value.reg; 200 kpu->layer_argument_fifo = layer->conv_value2.reg; 201 kpu->layer_argument_fifo = layer->dma_parameter.reg; 553 202 } 554 203 555 204 void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata) 556 205 { 557 558 559 560 206 uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1); 207 dmac_set_irq(dma_ch, callback, userdata, 1); 208 dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT, 209 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8); 561 210 } 562 211 563 212 static void kpu_conv2d_core(kpu_layer_argument_t *layer) 564 213 { 565 214 kpu_send_layer(layer); 566 215 } 567 216 568 217 void kpu_conv2d(kpu_layer_argument_t *layer) 569 218 { 570 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 571 .calc_done_int = 1, 572 .layer_cfg_almost_empty_int = 1, 573 .layer_cfg_almost_full_int = 1}; 574 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 575 .calc_done_int = 1, 576 .layer_cfg_almost_empty_int = 0, 577 .layer_cfg_almost_full_int = 1}; 578 kpu_conv2d_core(layer); 579 } 580 581 void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata) 582 { 583 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 584 .calc_done_int = 1, 585 .layer_cfg_almost_empty_int = 1, 586 .layer_cfg_almost_full_int = 1}; 587 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 588 .calc_done_int = 1, 589 .layer_cfg_almost_empty_int = 1, 590 .layer_cfg_almost_full_int = 1}; 591 layer->dma_parameter.data.send_data_out = 1; 592 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 593 dmac_set_irq(dma_ch, callback, userdata, 1); 594 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 595 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8); 596 kpu_conv2d_core(layer); 597 } 598 599 void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata) 600 { 601 uint32_t channels = layer->image_channel_num.data.o_ch_num + 1; 602 layer->interrupt_enabe.data.full_add = 1; 603 604 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 605 .calc_done_int = 1, 606 .layer_cfg_almost_empty_int = 1, 607 .layer_cfg_almost_full_int = 1}; 608 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 609 .calc_done_int = 1, 610 .layer_cfg_almost_empty_int = 1, 611 .layer_cfg_almost_full_int = 1}; 612 layer->dma_parameter.data.send_data_out = 1; 613 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 614 dmac_set_irq(dma_ch, callback, userdata, 1); 615 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 616 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels); 617 kpu_conv2d_core(layer); 618 } 619 620 void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param) 621 { 622 quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param; 623 624 size_t i; 625 for(i = 0; i < count; i++) 626 { 627 int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale; 628 if(value < 0) 629 value = 0; 630 if(value > 0xFF) 631 value = 0xFF; 632 *dest++ = value; 633 } 219 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 220 .calc_done_int = 1, 221 .layer_cfg_almost_empty_int = 1, 222 .layer_cfg_almost_full_int = 1}; 223 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 224 .calc_done_int = 1, 225 .layer_cfg_almost_empty_int = 0, 226 .layer_cfg_almost_full_int = 1}; 227 kpu_conv2d_core(layer); 634 228 } 635 229 636 230 void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param) 637 231 { 638 quantize_param_t q1 = *src_param, q2 = *dest_param; 639 size_t oc, y, x; 640 641 if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024) 642 { 643 uint32_t row_padding = 16; 644 uint32_t row_group = 4; 645 uint32_t row_length = 1; 646 uint32_t height = 4; 647 648 for(oc = 0; oc < channels; oc++) 649 { 650 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 651 for(y = 0; y < 1; y++) 652 { 653 uint8_t *y_origin = channel_origin + y * row_length * 64; 654 for(x = 0; x < 1; x++) 655 { 656 int64_t sum = 0; 657 size_t i; 658 for(i = 0; i < kernel_size; i++) 659 sum += *src++; 660 661 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale; 662 if(value < 0) 663 value = 0; 664 if(value > 0xFF) 665 value = 0xFF; 666 y_origin[x] = value; 667 } 668 } 669 } 670 } else 671 { 672 for(oc = 0; oc < channels; oc++) 673 { 674 int64_t sum = 0; 675 size_t i; 676 for(i = 0; i < kernel_size; i++) 677 sum += *src++; 678 679 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale; 680 if(value < 0) 681 value = 0; 682 if(value > 0xFF) 683 value = 0xFF; 684 dest[oc] = value; 685 } 686 } 232 quantize_param_t q1 = *src_param, q2 = *dest_param; 233 size_t oc, y, x; 234 235 if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024) 236 { 237 uint32_t row_padding = 16; 238 uint32_t row_group = 4; 239 uint32_t row_length = 1; 240 uint32_t height = 4; 241 242 for (oc = 0; oc < channels; oc++) 243 { 244 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 245 for (y = 0; y < 1; y++) 246 { 247 uint8_t *y_origin = channel_origin + y * row_length * 64; 248 for (x = 0; x < 1; x++) 249 { 250 int64_t sum = 0; 251 size_t i; 252 for (i = 0; i < kernel_size; i++) 253 sum += *src++; 254 255 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale; 256 if (value < 0) 257 value = 0; 258 if (value > 0xFF) 259 value = 0xFF; 260 y_origin[x] = value; 261 } 262 } 263 } 264 } 265 else 266 { 267 for (oc = 0; oc < channels; oc++) 268 { 269 int64_t sum = 0; 270 size_t i; 271 for (i = 0; i < kernel_size; i++) 272 sum += *src++; 273 274 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale; 275 if (value < 0) 276 value = 0; 277 if (value > 0xFF) 278 value = 0xFF; 279 dest[oc] = value; 280 } 281 } 687 282 } 688 283 689 284 void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest) 690 285 { 691 quantize_param_t q = *src_param; 692 size_t oc; 693 694 for(oc = 0; oc < channels; oc++) 695 { 696 int64_t sum = 0; 697 size_t i; 698 for(i = 0; i < kernel_size; i++) 699 sum += *src++; 700 701 float value = (sum * q.scale + q.bias) / kernel_size; 702 dest[oc] = value; 703 } 704 } 705 706 void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param) 707 { 708 quantize_param_t q1 = *dest_param; 709 size_t i = 0; 710 for(i = 0; i < channels; i++) 711 *dest++ = src[i * 16] * q1.scale + q1.bias; 712 } 713 714 void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels) 715 { 716 int ic, oc; 717 for(oc = 0; oc < output_channels; oc++) 718 { 719 const float *c_weights = weights + oc * input_channels; 720 721 float sum = 0.0f; 722 for(ic = 0; ic < input_channels; ic++) 723 sum += src[ic] * c_weights[ic]; 724 dest[oc] = sum + biases[oc]; 725 } 726 } 727 728 void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest) 729 { 730 quantize_param_t q1 = *src_param; 731 size_t i = 0; 732 for(i = 0; i < count; i++) 733 *dest++ = src[i] * q1.scale + q1.bias; 734 } 735 736 void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels) 737 { 738 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64); 739 size_t oc, y, x; 740 741 uint32_t row_padding; 742 uint32_t row_group; 743 uint32_t row_length; 744 745 if(width <= 16) 746 { 747 row_padding = 16; 748 row_group = 4; 749 row_length = 1; 750 } else if(width <= 32) 751 { 752 row_padding = 32; 753 row_group = 2; 754 row_length = 1; 755 } else 756 { 757 row_padding = 64; 758 row_group = 1; 759 row_length = (width + 63) / 64; 760 } 761 762 for(oc = 0; oc < channels; oc++) 763 { 764 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 765 for(y = 0; y < height; y++) 766 { 767 uint8_t *y_origin = channel_origin + y * row_length * 64; 768 for(x = 0; x < width; x++) 769 y_origin[x] = *src++; 770 } 771 } 772 } 286 quantize_param_t q = *src_param; 287 size_t oc; 288 289 for (oc = 0; oc < channels; oc++) 290 { 291 int64_t sum = 0; 292 size_t i; 293 for (i = 0; i < kernel_size; i++) 294 sum += *src++; 295 296 float value = (sum * q.scale + q.bias) / kernel_size; 297 dest[oc] = value; 298 } 299 } 300 773 301 #if USE_CACHED_AI_RAM 774 302 static void kpu_flush_cache(uint32_t addr, size_t lines) 775 303 { 776 777 for(line = 0; line < lines; line++)778 779 780 781 782 for(i = 0; i < 8; i++)783 784 304 size_t line; 305 for (line = 0; line < lines; line++) 306 { 307 const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64); 308 uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64); 309 size_t i; 310 for (i = 0; i < 8; i++) 311 dest[i] = src[i]; 312 } 785 313 } 786 314 #endif 787 315 static int64_t kpu_carry_shift(int64_t value, uint32_t shift) 788 316 { 789 if(shift > 0) 790 { 791 value >>= shift - 1; 792 if(value & 0x1) 793 { 794 if(value < 0) 795 value = (value >> 1) - 1; 796 else 797 value = (value >> 1) + 1; 798 } else 799 { 800 value >>= 1; 801 } 802 } 803 804 return value; 317 if (shift > 0) 318 { 319 value >>= shift - 1; 320 if (value & 0x1) 321 { 322 if (value < 0) 323 value = (value >> 1) - 1; 324 else 325 value = (value >> 1) + 1; 326 } 327 else 328 { 329 value >>= 1; 330 } 331 } 332 333 return value; 805 334 } 806 335 static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr) 807 336 { 808 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64); 809 size_t oc, y, x; 810 uint32_t row_padding; 811 uint32_t row_group; 812 uint32_t row_length; 813 if(width <= 16) 814 { 815 row_padding = 16; 816 row_group = 4; 817 row_length = 1; 818 } else if(width <= 32) 819 { 820 row_padding = 32; 821 row_group = 2; 822 row_length = 1; 823 } else 824 { 825 row_padding = 64; 826 row_group = 1; 827 row_length = (width + 63) / 64; 828 } 829 830 if((uintptr_t)src % 8 == 0 && width % 8 == 0) 831 { 337 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64); 338 size_t oc, y, x; 339 uint32_t row_padding; 340 uint32_t row_group; 341 uint32_t row_length; 342 if (width <= 16) 343 { 344 row_padding = 16; 345 row_group = 4; 346 row_length = 1; 347 } 348 else if (width <= 32) 349 { 350 row_padding = 32; 351 row_group = 2; 352 row_length = 1; 353 } 354 else 355 { 356 row_padding = 64; 357 row_group = 1; 358 row_length = (width + 63) / 64; 359 } 360 361 if ((uintptr_t)src % 8 == 0 && width % 8 == 0) 362 { 832 363 #define UPLOAD_BEGIN() \ 833 for(oc = 0; oc < channels; oc++)\834 835 836 for(y = 0; y < height; y++)\837 838 364 for (oc = 0; oc < channels; oc++) \ 365 { \ 366 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \ 367 for (y = 0; y < height; y++) \ 368 { \ 369 uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64); 839 370 840 371 #define UPLOAD_END() \ 841 } \ 842 } 843 844 width /= 8; 845 const uint64_t *u64_src = (const uint64_t *)src; 846 if(width == 1) 847 { 848 UPLOAD_BEGIN() 849 y_origin[0] = *u64_src++; 850 UPLOAD_END() 851 } else if(width == 2) 852 { 853 UPLOAD_BEGIN() 854 { 855 y_origin[0] = *u64_src++; 856 y_origin[1] = *u64_src++; 857 } 858 UPLOAD_END() 859 } else if(width == 4) 860 { 861 UPLOAD_BEGIN() 862 { 863 y_origin[0] = *u64_src++; 864 y_origin[1] = *u64_src++; 865 y_origin[2] = *u64_src++; 866 y_origin[3] = *u64_src++; 867 } 868 UPLOAD_END() 869 } else 870 { 871 UPLOAD_BEGIN() 872 for(x = 0; x < width; x++) 873 y_origin[x] = *u64_src++; 874 UPLOAD_END() 875 } 876 } else 877 { 878 for(oc = 0; oc < channels; oc++) 879 { 880 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 881 for(y = 0; y < height; y++) 882 { 883 uint8_t *y_origin = channel_origin + y * row_length * 64; 884 for(x = 0; x < width; x++) 885 y_origin[x] = *src++; 886 } 887 } 888 } 372 } \ 373 } 374 375 width /= 8; 376 const uint64_t *u64_src = (const uint64_t *)src; 377 if (width == 1) 378 { 379 UPLOAD_BEGIN() 380 y_origin[0] = *u64_src++; 381 UPLOAD_END() 382 } 383 else if (width == 2) 384 { 385 UPLOAD_BEGIN() 386 { 387 y_origin[0] = *u64_src++; 388 y_origin[1] = *u64_src++; 389 } 390 UPLOAD_END() 391 } 392 else if (width == 4) 393 { 394 UPLOAD_BEGIN() 395 { 396 y_origin[0] = *u64_src++; 397 y_origin[1] = *u64_src++; 398 y_origin[2] = *u64_src++; 399 y_origin[3] = *u64_src++; 400 } 401 UPLOAD_END() 402 } 403 else 404 { 405 UPLOAD_BEGIN() 406 for (x = 0; x < width; x++) 407 y_origin[x] = *u64_src++; 408 UPLOAD_END() 409 } 410 } 411 else 412 { 413 for (oc = 0; oc < channels; oc++) 414 { 415 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 416 for (y = 0; y < height; y++) 417 { 418 uint8_t *y_origin = channel_origin + y * row_length * 64; 419 for (x = 0; x < width; x++) 420 y_origin[x] = *src++; 421 } 422 } 423 } 889 424 } 890 425 static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src) 891 426 { 892 893 894 895 896 427 size_t width = layer->image_size.data.i_row_wid + 1; 428 size_t height = layer->image_size.data.i_col_high + 1; 429 size_t channels = layer->image_channel_num.data.i_ch_num + 1; 430 431 kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr); 897 432 } 898 433 899 434 static void kpu_kmodel_input_float(const float *src, float *dest, size_t count) 900 435 { 901 436 memcpy(dest, src, count * sizeof(float)); 902 437 } 903 438 904 439 static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act) 905 440 { 906 size_t i; 907 908 if(act == KLA_RELU) 909 { 910 for(i = 0; i < count; i++) 911 data[i] = max(data[i], 0); 912 } else if(act == KLA_RELU6) 913 { 914 for(i = 0; i < count; i++) 915 data[i] = min(max(data[i], 0), 6); 916 } 441 size_t i; 442 443 if (act == KLA_RELU) 444 { 445 for (i = 0; i < count; i++) 446 data[i] = max(data[i], 0); 447 } 448 else if (act == KLA_RELU6) 449 { 450 for (i = 0; i < count; i++) 451 data[i] = min(max(data[i], 0), 6); 452 } 917 453 } 918 454 919 455 static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx) 920 456 { 921 922 923 924 925 926 for(i = 0; i < count; i++)927 457 const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address); 458 const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address); 459 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 460 size_t i, count = arg->count; 461 462 for (i = 0; i < count; i++) 463 dest[i] = src_a[i] + src_b[i]; 928 464 } 929 465 930 466 static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx) 931 467 { 932 933 934 935 936 937 938 939 940 941 942 if(sh_a == sh_b)943 468 const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address); 469 const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address); 470 size_t count = ALIGN_UP(arg->count, 8) / 8; 471 int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift; 472 int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift; 473 int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift; 474 475 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 476 size_t i; 477 478 if (sh_a == sh_b) 479 { 944 480 #define QADD_UNROLL_1(x) \ 945 946 481 int64_t a##x = *src_a++; \ 482 int64_t b##x = *src_b++; 947 483 948 484 #define QADD_UNROLL_2(x) \ 949 950 485 a##x += off_a; \ 486 b##x += off_b; 951 487 952 488 #define QADD_UNROLL_3(x) \ 953 954 489 a##x *= mul_a; \ 490 b##x *= mul_b; 955 491 956 492 #define QADD_UNROLL_4(x) \ 957 493 int64_t v##x = a##x + b##x; 958 494 959 495 #define QADD_UNROLL_5(x) \ 960 496 v##x >>= sh_a; 961 497 962 498 #define QADD_UNROLL_6(x) \ 963 499 v##x *= mul_o; 964 500 965 501 #define QADD_UNROLL_7(x) \ 966 502 v##x = kpu_carry_shift(v##x, sh_o); 967 503 968 504 #define QADD_UNROLL_8(x) \ 969 505 v##x += off_o; 970 506 971 507 #define QADD_UNROLL_9(x) \ 972 508 v##x = min(0xFF, max(0, v##x)); 973 509 974 510 #define QADD_UNROLL_10(x) \ 975 511 *dest++ = v##x; 976 512 977 513 #define QADD_UNROLL_S(x) \ 978 QADD_UNROLL_##x(0) \ 979 QADD_UNROLL_##x(1) \ 980 QADD_UNROLL_##x(2) \ 981 QADD_UNROLL_##x(3) \ 982 QADD_UNROLL_##x(4) \ 983 QADD_UNROLL_##x(5) \ 984 QADD_UNROLL_##x(6) \ 985 QADD_UNROLL_##x(7) 986 987 for(i = 0; i < count; i++) 988 { 989 QADD_UNROLL_S(1); 990 QADD_UNROLL_S(2); 991 QADD_UNROLL_S(3); 992 QADD_UNROLL_S(4); 993 QADD_UNROLL_S(5); 994 QADD_UNROLL_S(6); 995 QADD_UNROLL_S(7); 996 QADD_UNROLL_S(8); 997 QADD_UNROLL_S(9); 998 QADD_UNROLL_S(10); 999 } 1000 } else 1001 { 514 QADD_UNROLL_##x(0) \ 515 QADD_UNROLL_##x(1) \ 516 QADD_UNROLL_##x(2) \ 517 QADD_UNROLL_##x(3) \ 518 QADD_UNROLL_##x(4) \ 519 QADD_UNROLL_##x(5) \ 520 QADD_UNROLL_##x(6) \ 521 QADD_UNROLL_##x(7) 522 523 for (i = 0; i < count; i++) 524 { 525 QADD_UNROLL_S(1); 526 QADD_UNROLL_S(2); 527 QADD_UNROLL_S(3); 528 QADD_UNROLL_S(4); 529 QADD_UNROLL_S(5); 530 QADD_UNROLL_S(6); 531 QADD_UNROLL_S(7); 532 QADD_UNROLL_S(8); 533 QADD_UNROLL_S(9); 534 QADD_UNROLL_S(10); 535 } 536 } 537 else 538 { 1002 539 #undef QADD_UNROLL_1 1003 540 #define QADD_UNROLL_1(x) \ 1004 1005 541 int64_t a##x = *src_a++; \ 542 int64_t b##x = *src_b++; 1006 543 1007 544 #undef QADD_UNROLL_2 1008 545 #define QADD_UNROLL_2(x) \ 1009 1010 546 a##x += off_a; \ 547 b##x += off_b; 1011 548 1012 549 #undef QADD_UNROLL_3 1013 550 #define QADD_UNROLL_3(x) \ 1014 1015 551 a##x *= mul_a; \ 552 b##x *= mul_b; 1016 553 1017 554 #undef QADD_UNROLL_4 1018 555 #define QADD_UNROLL_4(x) \ 1019 1020 556 a##x >>= sh_a; \ 557 b##x >>= sh_b; 1021 558 1022 559 #undef QADD_UNROLL_5 1023 560 #define QADD_UNROLL_5(x) \ 1024 561 int64_t v##x = a##x + b##x; 1025 562 1026 563 #undef QADD_UNROLL_6 1027 564 #define QADD_UNROLL_6(x) \ 1028 565 v##x *= mul_o; 1029 566 1030 567 #undef QADD_UNROLL_7 1031 568 #define QADD_UNROLL_7(x) \ 1032 569 v##x = kpu_carry_shift(v##x, sh_o); 1033 570 1034 571 #undef QADD_UNROLL_8 1035 572 #define QADD_UNROLL_8(x) \ 1036 573 v##x += off_o; 1037 574 1038 575 #undef QADD_UNROLL_9 1039 576 #define QADD_UNROLL_9(x) \ 1040 577 v##x = min(0xFF, max(0, v##x)); 1041 578 1042 579 #undef QADD_UNROLL_10 1043 580 #define QADD_UNROLL_10(x) \ 1044 581 *dest++ = v##x; 1045 582 1046 583 #undef QADD_UNROLL_S 1047 584 #define QADD_UNROLL_S(x) \ 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 for(i = 0; i < count; i++)1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 585 QADD_UNROLL_##x(0) \ 586 QADD_UNROLL_##x(1) \ 587 QADD_UNROLL_##x(2) \ 588 QADD_UNROLL_##x(3) \ 589 QADD_UNROLL_##x(4) \ 590 QADD_UNROLL_##x(5) \ 591 QADD_UNROLL_##x(6) \ 592 QADD_UNROLL_##x(7) 593 594 for (i = 0; i < count; i++) 595 { 596 QADD_UNROLL_S(1); 597 QADD_UNROLL_S(2); 598 QADD_UNROLL_S(3); 599 QADD_UNROLL_S(4); 600 QADD_UNROLL_S(5); 601 QADD_UNROLL_S(6); 602 QADD_UNROLL_S(7); 603 QADD_UNROLL_S(8); 604 QADD_UNROLL_S(9); 605 QADD_UNROLL_S(10); 606 } 607 } 1071 608 } 1072 609 1073 610 static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx) 1074 611 { 1075 1076 1077 1078 1079 for(oc = 0; oc < channels; oc++)1080 1081 1082 1083 for(i = 0; i < kernel_size; i++)1084 1085 1086 1087 612 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 613 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 614 size_t oc, channels = arg->channels, kernel_size = arg->kernel_size; 615 616 for (oc = 0; oc < channels; oc++) 617 { 618 float sum = 0.f; 619 size_t i; 620 for (i = 0; i < kernel_size; i++) 621 sum += *src++; 622 623 dest[oc] = sum / kernel_size; 624 } 1088 625 } 1089 626 1090 627 static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx) 1091 628 { 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 for(oc = 0; oc < out_shape.channels; oc++)1102 1103 1104 for(out_y = 0; out_y < out_shape.height; out_y++)1105 1106 for(out_x = 0; out_x < out_shape.width; out_x++)1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)1118 1119 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 629 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 630 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 631 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape; 632 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height; 633 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height; 634 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height; 635 636 uint32_t out_y, out_x, oc; 637 638 for (oc = 0; oc < out_shape.channels; oc++) 639 { 640 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc; 641 for (out_y = 0; out_y < out_shape.height; out_y++) 642 { 643 for (out_x = 0; out_x < out_shape.width; out_x++) 644 { 645 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width; 646 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height; 647 int32_t kernel_x_start = max(0, -in_x_origin); 648 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin); 649 int32_t kernel_y_start = max(0, -in_y_origin); 650 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin); 651 uint8_t value = 0; 652 653 int32_t kernel_y, kernel_x; 654 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++) 655 { 656 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++) 657 { 658 int32_t in_x = in_x_origin + kernel_x; 659 int32_t in_y = in_y_origin + kernel_y; 660 value = max(value, channel_src[in_y * in_shape.width + in_x]); 661 } 662 } 663 664 *dest++ = value; 665 } 666 } 667 } 1131 668 } 1132 669 1133 670 static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx) 1134 671 { 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 for(oc = 0; oc < out_shape.channels; oc++)1145 1146 1147 for(out_y = 0; out_y < out_shape.height; out_y++)1148 1149 for(out_x = 0; out_x < out_shape.width; out_x++)1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)1162 1163 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 672 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 673 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 674 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape; 675 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height; 676 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height; 677 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height; 678 679 uint32_t out_y, out_x, oc; 680 681 for (oc = 0; oc < out_shape.channels; oc++) 682 { 683 const float *channel_src = src + in_shape.width * in_shape.height * oc; 684 for (out_y = 0; out_y < out_shape.height; out_y++) 685 { 686 for (out_x = 0; out_x < out_shape.width; out_x++) 687 { 688 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width; 689 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height; 690 int32_t kernel_x_start = max(0, -in_x_origin); 691 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin); 692 int32_t kernel_y_start = max(0, -in_y_origin); 693 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin); 694 float value = 0; 695 float kernel_count = 0; 696 697 int32_t kernel_y, kernel_x; 698 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++) 699 { 700 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++) 701 { 702 int32_t in_x = in_x_origin + kernel_x; 703 int32_t in_y = in_y_origin + kernel_y; 704 value += channel_src[in_y * in_shape.width + in_x]; 705 kernel_count++; 706 } 707 } 708 709 *dest++ = value / kernel_count; 710 } 711 } 712 } 1176 713 } 1177 714 1178 715 static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx) 1179 716 { 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 for(i = 0; i < count; i++)1190 1191 1192 if(value < 0)1193 1194 if(value > 0xFF)1195 1196 1197 717 size_t count = arg->count; 718 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 719 720 kpu_model_quant_param_t q = arg->quant_param; 721 722 float scale = 1.f / q.scale; 723 724 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address); 725 size_t i; 726 for (i = 0; i < count; i++) 727 { 728 int value = roundf((*src++ - q.bias) * scale); 729 if (value < 0) 730 value = 0; 731 if (value > 0xFF) 732 value = 0xFF; 733 *dest++ = (uint8_t)value; 734 } 1198 735 } 1199 736 1200 737 static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx) 1201 738 { 1202 1203 1204 1205 1206 1207 for(oc = 0; oc < count; oc++)1208 739 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 740 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 741 size_t oc, count = arg->count; 742 kpu_model_quant_param_t q = arg->quant_param; 743 744 for (oc = 0; oc < count; oc++) 745 dest[oc] = *src++ * q.scale + q.bias; 1209 746 } 1210 747 1211 748 static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx) 1212 749 { 1213 1214 1215 1216 1217 for(oc = 0; oc < channels; oc++)1218 1219 1220 1221 for(i = 0; i < count; i++)1222 1223 750 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 751 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 752 size_t oc, i, channels = arg->channels, count = arg->channel_size; 753 754 for (oc = 0; oc < channels; oc++) 755 { 756 const kpu_model_quant_param_t q = arg->quant_params[oc]; 757 758 for (i = 0; i < count; i++) 759 *dest++ = *src++ * q.scale + q.bias; 760 } 1224 761 } 1225 762 1226 763 static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx) 1227 764 { 1228 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 1229 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 1230 size_t oc, count = arg->count; 1231 const uint8_t *table = arg->table; 1232 1233 if(false && count % 8 == 0) 1234 { 1235 for(oc = 0; oc < count;) 1236 { 1237 dest[oc++] = table[*src++]; 1238 dest[oc++] = table[*src++]; 1239 dest[oc++] = table[*src++]; 1240 dest[oc++] = table[*src++]; 1241 dest[oc++] = table[*src++]; 1242 dest[oc++] = table[*src++]; 1243 dest[oc++] = table[*src++]; 1244 dest[oc++] = table[*src++]; 1245 } 1246 } else 1247 { 1248 for(oc = 0; oc < count; oc++) 1249 dest[oc] = table[src[oc]]; 1250 } 765 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 766 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 767 size_t oc, count = arg->count; 768 const uint8_t *table = arg->table; 769 770 if (false && count % 8 == 0) 771 { 772 for (oc = 0; oc < count;) 773 { 774 dest[oc++] = table[*src++]; 775 dest[oc++] = table[*src++]; 776 dest[oc++] = table[*src++]; 777 dest[oc++] = table[*src++]; 778 dest[oc++] = table[*src++]; 779 dest[oc++] = table[*src++]; 780 dest[oc++] = table[*src++]; 781 dest[oc++] = table[*src++]; 782 } 783 } 784 else 785 { 786 for (oc = 0; oc < count; oc++) 787 dest[oc] = table[src[oc]]; 788 } 1251 789 } 1252 790 1253 791 static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx) 1254 792 { 1255 1256 1257 1258 1259 1260 1261 for(oc = 0; oc < channels; oc++)1262 1263 if(sum < epsilon)1264 1265 1266 for(oc = 0; oc < channels; oc++)1267 793 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 794 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 795 size_t oc, channels = arg->channels; 796 797 float sum = 0.f; 798 const float epsilon = 1e-10f; 799 for (oc = 0; oc < channels; oc++) 800 sum += src[oc] * src[oc]; 801 if (sum < epsilon) 802 sum = epsilon; 803 sum = 1.f / sqrtf(sum); 804 for (oc = 0; oc < channels; oc++) 805 dest[oc] = src[oc] * sum; 1268 806 } 1269 807 1270 808 static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx) 1271 809 { 1272 1273 1274 1275 1276 1277 for(oc = 0; oc < channels; oc++)1278 1279 1280 1281 for(oc = 0; oc < channels; oc++)1282 1283 1284 1285 1286 1287 1288 for(oc = 0; oc < channels; oc++)1289 810 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 811 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 812 size_t oc, channels = arg->channels; 813 814 float max = FLT_MIN; 815 for (oc = 0; oc < channels; oc++) 816 max = fmaxf(max, src[oc]); 817 818 float sum = 0.f; 819 for (oc = 0; oc < channels; oc++) 820 { 821 float value = expf(src[oc] - max); 822 sum += value; 823 dest[oc] = value; 824 } 825 826 for (oc = 0; oc < channels; oc++) 827 dest[oc] /= sum; 1290 828 } 1291 829 1292 830 static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx) 1293 831 { 1294 1295 1296 1297 for(i = 0; i < count; i++)1298 1299 1300 1301 1302 1303 832 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 833 uint32_t count = arg->input_count, i; 834 835 for (i = 0; i < count; i++) 836 { 837 kpu_model_memory_range_t input = arg->inputs_mem[i]; 838 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start); 839 memcpy(dest, src, input.size); 840 dest += input.size; 841 } 1304 842 } 1305 843 1306 844 static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx) 1307 845 { 1308 1309 1310 1311 1312 1313 1314 1315 1316 if(in_channels % 8 == 0)1317 846 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 847 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 848 uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc; 849 float *weights = (float *)malloc(in_channels * out_channels * sizeof(float)); 850 float *bias = (float *)malloc(out_channels * sizeof(float)); 851 memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float)); 852 memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float)); 853 854 if (in_channels % 8 == 0) 855 { 1318 856 #define FC_UNROLL_1(x) \ 1319 1320 857 float i##x = *c_src++; \ 858 float w##x = *c_weights++; 1321 859 1322 860 #define FC_UNROLL_2(x) \ 1323 861 sum += i##x * w##x; 1324 862 1325 863 #define FC_UNROLL_S(x) \ 1326 FC_UNROLL_##x(0) \ 1327 FC_UNROLL_##x(1) \ 1328 FC_UNROLL_##x(2) \ 1329 FC_UNROLL_##x(3) \ 1330 FC_UNROLL_##x(4) \ 1331 FC_UNROLL_##x(5) \ 1332 FC_UNROLL_##x(6) \ 1333 FC_UNROLL_##x(7) 1334 1335 for(oc = 0; oc < out_channels; oc++) 1336 { 1337 const float *c_src = src; 1338 const float *c_weights = weights + oc * in_channels; 1339 1340 float sum = 0.0f; 1341 for(ic = 0; ic < in_channels / 8; ic++) 1342 { 1343 FC_UNROLL_S(1); 1344 FC_UNROLL_S(2); 1345 } 1346 1347 dest[oc] = sum + bias[oc]; 1348 } 1349 } else 1350 { 1351 for(oc = 0; oc < out_channels; oc++) 1352 { 1353 const float *c_weights = weights + oc * in_channels; 1354 1355 float sum = 0.0f; 1356 for(ic = 0; ic < in_channels; ic++) 1357 sum += src[ic] * c_weights[ic]; 1358 dest[oc] = sum + bias[oc]; 1359 } 1360 } 1361 free(weights); 1362 free(bias); 1363 kpu_float_activation(dest, out_channels, arg->act); 864 FC_UNROLL_##x(0) \ 865 FC_UNROLL_##x(1) \ 866 FC_UNROLL_##x(2) \ 867 FC_UNROLL_##x(3) \ 868 FC_UNROLL_##x(4) \ 869 FC_UNROLL_##x(5) \ 870 FC_UNROLL_##x(6) \ 871 FC_UNROLL_##x(7) 872 873 for (oc = 0; oc < out_channels; oc++) 874 { 875 const float *c_src = src; 876 const float *c_weights = weights + oc * in_channels; 877 878 float sum = 0.0f; 879 for (ic = 0; ic < in_channels / 8; ic++) 880 { 881 FC_UNROLL_S(1); 882 FC_UNROLL_S(2); 883 } 884 885 dest[oc] = sum + bias[oc]; 886 } 887 } 888 else 889 { 890 for (oc = 0; oc < out_channels; oc++) 891 { 892 const float *c_weights = weights + oc * in_channels; 893 894 float sum = 0.0f; 895 for (ic = 0; ic < in_channels; ic++) 896 sum += src[ic] * c_weights[ic]; 897 dest[oc] = sum + bias[oc]; 898 } 899 } 900 free(weights); 901 free(bias); 902 kpu_float_activation(dest, out_channels, arg->act); 1364 903 } 1365 904 1366 905 static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx) 1367 906 { 1368 1369 1370 1371 1372 1373 for(oy = 0; oy < in_shape.height; oy++)1374 for(ox = 0; ox < in_shape.width; ox++)1375 for(oc = 0; oc < in_shape.channels; oc++)1376 907 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 908 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 909 kpu_model_shape_t in_shape = arg->shape; 910 uint32_t oc, oy, ox; 911 912 for (oy = 0; oy < in_shape.height; oy++) 913 for (ox = 0; ox < in_shape.width; ox++) 914 for (oc = 0; oc < in_shape.channels; oc++) 915 *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox]; 1377 916 } 1378 917 1379 918 static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx) 1380 919 { 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 for(oc = 0; oc < in_shape.channels; oc++)1391 1392 1393 for(oy = 0; oy < out_height; oy++)1394 1395 1396 1397 for(ox = 0; ox < out_width; ox++)1398 1399 1400 1401 1402 1403 920 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 921 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 922 kpu_model_shape_t in_shape = arg->in_shape; 923 uint32_t out_width = arg->out_width, out_height = arg->out_height; 924 uint32_t oc, oy, ox; 925 926 float height_scale = (float)in_shape.height / out_height; 927 float width_scale = (float)in_shape.width / out_width; 928 929 for (oc = 0; oc < in_shape.channels; oc++) 930 { 931 const float *channel_src = src + in_shape.width * in_shape.height * oc; 932 for (oy = 0; oy < out_height; oy++) 933 { 934 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1); 935 const float *y_origin = channel_src + in_y * in_shape.width; 936 for (ox = 0; ox < out_width; ox++) 937 { 938 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1); 939 *dest++ = y_origin[in_x]; 940 } 941 } 942 } 1404 943 } 1405 944 1406 945 static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx) 1407 946 { 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 for(oc = 0; oc < in_shape.channels; oc++)1418 1419 1420 for(oy = 0; oy < out_height; oy++)1421 1422 1423 1424 for(ox = 0; ox < out_width; ox++)1425 1426 1427 1428 1429 1430 947 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 948 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 949 kpu_model_shape_t in_shape = arg->in_shape; 950 uint32_t out_width = arg->out_width, out_height = arg->out_height; 951 uint32_t oc, oy, ox; 952 953 float height_scale = (float)in_shape.height / out_height; 954 float width_scale = (float)in_shape.width / out_width; 955 956 for (oc = 0; oc < in_shape.channels; oc++) 957 { 958 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc; 959 for (oy = 0; oy < out_height; oy++) 960 { 961 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1); 962 const uint8_t *y_origin = channel_src + in_y * in_shape.width; 963 for (ox = 0; ox < out_width; ox++) 964 { 965 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1); 966 *dest++ = y_origin[in_x]; 967 } 968 } 969 } 1431 970 } 1432 971 1433 972 static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx) 1434 973 { 1435 1436 1437 1438 1439 for(oc = 0; oc < channels; oc++)1440 974 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address); 975 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address); 976 size_t oc, channels = arg->channels; 977 978 for (oc = 0; oc < channels; oc++) 979 dest[oc] = 1.f / (1.f + expf(-src[oc])); 1441 980 } 1442 981 1443 982 static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx) 1444 983 { 1445 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset); 1446 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM; 1447 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM; 1448 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM; 1449 1450 if(arg->flags & KLF_MAIN_MEM_OUT) 1451 { 1452 dmac_channel_number_t dma_ch = ctx->dma_ch; 1453 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address; 1454 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1455 .calc_done_int = 1, 1456 .layer_cfg_almost_empty_int = 1, 1457 .layer_cfg_almost_full_int = 1}; 1458 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1459 .calc_done_int = 1, 1460 .layer_cfg_almost_empty_int = 1, 1461 .layer_cfg_almost_full_int = 1}; 1462 layer.dma_parameter.data.send_data_out = 1; 1463 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 1464 if(ctx->current_layer < ctx->layers_length) 1465 dmac_set_irq(dma_ch, ai_step, ctx, 1); 1466 else 1467 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1); 1468 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 1469 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8); 1470 } else 1471 { 1472 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1473 .calc_done_int = 1, 1474 .layer_cfg_almost_empty_int = 1, 1475 .layer_cfg_almost_full_int = 1}; 1476 1477 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1478 .calc_done_int = 0, 1479 .layer_cfg_almost_empty_int = 1, 1480 .layer_cfg_almost_full_int = 1}; 1481 layer.interrupt_enabe.data.int_en = 1; 1482 } 1483 1484 kpu_send_layer((const kpu_layer_argument_t *)&layer); 984 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset); 985 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM; 986 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM; 987 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM; 988 989 if (arg->flags & KLF_MAIN_MEM_OUT) 990 { 991 dmac_channel_number_t dma_ch = ctx->dma_ch; 992 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address; 993 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 994 .calc_done_int = 1, 995 .layer_cfg_almost_empty_int = 1, 996 .layer_cfg_almost_full_int = 1}; 997 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 998 .calc_done_int = 1, 999 .layer_cfg_almost_empty_int = 1, 1000 .layer_cfg_almost_full_int = 1}; 1001 layer.dma_parameter.data.send_data_out = 1; 1002 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ); 1003 if (ctx->current_layer < ctx->layers_length) 1004 dmac_set_irq(dma_ch, ai_step, ctx, 1); 1005 else 1006 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1); 1007 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT, 1008 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8); 1009 } 1010 else 1011 { 1012 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1013 .calc_done_int = 1, 1014 .layer_cfg_almost_empty_int = 1, 1015 .layer_cfg_almost_full_int = 1}; 1016 1017 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1018 .calc_done_int = 0, 1019 .layer_cfg_almost_empty_int = 1, 1020 .layer_cfg_almost_full_int = 1}; 1021 layer.interrupt_enabe.data.int_en = 1; 1022 } 1023 1024 kpu_send_layer((const kpu_layer_argument_t *)&layer); 1485 1025 } 1486 1026 1487 1027 static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx) 1488 1028 { 1489 1029 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 1490 1030 #if USE_CACHED_AI_RAM 1491 1031 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64); 1492 1032 #else 1493 1033 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64); 1494 1034 #endif 1495 1035 1496 1497 1498 1499 1500 1501 1502 for(oc = 0; oc < channels; oc++)1503 1504 1505 for(y = 0; y < 1; y++)1506 1507 1508 for(x = 0; x < 1; x++)1509 1510 1511 1036 uint32_t row_padding = 16; 1037 uint32_t row_group = 4; 1038 uint32_t row_length = 1; 1039 uint32_t height = 4; 1040 uint32_t oc, x, y, channels = arg->channels; 1041 1042 for (oc = 0; oc < channels; oc++) 1043 { 1044 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; 1045 for (y = 0; y < 1; y++) 1046 { 1047 uint8_t *y_origin = channel_origin + y * row_length * 64; 1048 for (x = 0; x < 1; x++) 1049 y_origin[x] = *src++; 1050 } 1051 } 1512 1052 1513 1053 #if USE_CACHED_AI_RAM 1514 1515 1054 uint32_t lines = row_length * height * channels / row_group; 1055 kpu_flush_cache(arg->kpu_mem_out_address, lines); 1516 1056 #endif 1517 1057 } … … 1519 1059 static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx) 1520 1060 { 1521 1522 1523 1524 1525 for(oc = 0; oc < channels; oc++)1526 1061 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address); 1062 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address); 1063 uint32_t oc, channels = arg->channels; 1064 1065 for (oc = 0; oc < channels; oc++) 1066 *dest++ = src[oc * 16]; 1527 1067 } 1528 1068 1529 1069 static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx) 1530 1070 { 1531 1532 1533 1534 1535 1071 size_t width = arg->width; 1072 size_t height = arg->height; 1073 size_t channels = arg->channels; 1074 1075 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address); 1536 1076 } 1537 1077 … … 1539 1079 { 1540 1080 #if FIX_CACHE 1541 1081 configASSERT(is_memory_cache((uintptr_t)buffer)); 1542 1082 #endif 1543 uintptr_t base_addr = (uintptr_t)buffer; 1544 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer; 1545 1546 if (header->version == 3 && header->arch == 0) 1547 { 1548 ctx->is_nncase = 0; 1549 ctx->model_buffer = buffer; 1550 ctx->output_count = header->output_count; 1551 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t)); 1552 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count); 1553 ctx->layers_length = header->layers_length; 1554 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length); 1555 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage); 1556 if (!ctx->main_buffer) 1557 return -1; 1558 uint32_t body_size = 0; 1559 for (int i=0; i<ctx->layers_length; i++) 1560 { 1561 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i; 1562 body_size += cnt_layer_header->body_size; 1563 } 1564 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM); 1565 const uint8_t *body_start_cache = ctx->body_start; 1566 memcpy(body_start_iomem, body_start_cache, body_size); 1567 for (int i=0; i<body_size; i++) 1568 { 1569 configASSERT(body_start_iomem[i] == body_start_cache[i]); 1570 } 1571 1572 } else 1573 { 1574 return -1; 1575 } 1576 1577 return 0; 1083 uintptr_t base_addr = (uintptr_t)buffer; 1084 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer; 1085 1086 if (header->version == 3 && header->arch == 0) 1087 { 1088 ctx->model_buffer = buffer; 1089 ctx->output_count = header->output_count; 1090 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t)); 1091 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count); 1092 ctx->layers_length = header->layers_length; 1093 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length); 1094 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage); 1095 if (!ctx->main_buffer) 1096 return -1; 1097 uint32_t body_size = 0; 1098 for (int i = 0; i < ctx->layers_length; i++) 1099 { 1100 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i; 1101 body_size += cnt_layer_header->body_size; 1102 } 1103 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM); 1104 const uint8_t *body_start_cache = ctx->body_start; 1105 memcpy(body_start_iomem, body_start_cache, body_size); 1106 for (int i = 0; i < body_size; i++) 1107 { 1108 configASSERT(body_start_iomem[i] == body_start_cache[i]); 1109 } 1110 } 1111 else 1112 { 1113 return -1; 1114 } 1115 1116 return 0; 1578 1117 } 1579 1118 1580 1119 int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size) 1581 1120 { 1582 if(ctx->is_nncase) 1583 return -1; 1584 1585 if(index >= ctx->output_count) 1586 return -1; 1587 1588 const kpu_model_output_t *output = ctx->outputs + index; 1589 *data = ctx->main_buffer + output->address; 1590 *size = output->size; 1591 return 0; 1121 if (index >= ctx->output_count) 1122 return -1; 1123 1124 const kpu_model_output_t *output = ctx->outputs + index; 1125 *data = ctx->main_buffer + output->address; 1126 *size = output->size; 1127 return 0; 1592 1128 } 1593 1129 1594 1130 void kpu_model_free(kpu_model_context_t *ctx) 1595 1131 { 1596 if(ctx->is_nncase) 1597 return; 1598 1599 free(ctx->main_buffer); 1600 ctx->main_buffer = NULL; 1132 free(ctx->main_buffer); 1133 ctx->main_buffer = NULL; 1601 1134 } 1602 1135 … … 1609 1142 static const char *str_layer_type(uint32_t type) 1610 1143 { 1611 switch(type)1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1144 switch (type) 1145 { 1146 case KL_ADD: 1147 return "Add"; 1148 case KL_QUANTIZED_ADD: 1149 return "QuantAdd"; 1150 case KL_GLOBAL_AVERAGE_POOL2D: 1151 return "GAP"; 1152 case KL_QUANTIZED_MAX_POOL2D: 1153 return "QuantMaxPool2d"; 1154 case KL_AVERAGE_POOL2D: 1155 return "AveragePool2d"; 1156 case KL_QUANTIZE: 1157 return "Quantize"; 1158 case KL_DEQUANTIZE: 1159 return "Dequantize"; 1160 case KL_REQUANTIZE: 1161 return "Requantize"; 1162 case KL_L2_NORMALIZATION: 1163 return "L2Norm"; 1164 case KL_SOFTMAX: 1165 return "Softmax"; 1166 case KL_CONCAT: 1167 return "Concat"; 1168 case KL_QUANTIZED_CONCAT: 1169 return "QuantConcat"; 1170 case KL_FULLY_CONNECTED: 1171 return "FullyConnected"; 1172 case KL_TENSORFLOW_FLATTEN: 1173 return "TFFlatten"; 1174 case KL_RESIZE_NEAREST_NEIGHBOR: 1175 return "ResizeNearestNeighbor"; 1176 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: 1177 return "QuantResizeNearestNeighbor"; 1178 case KL_CHANNELWISE_DEQUANTIZE: 1179 return "ChannelwiseDequantize"; 1180 case KL_LOGISTIC: 1181 return "Logistic"; 1182 case KL_K210_CONV: 1183 return "K210Conv"; 1184 case KL_K210_ADD_PADDING: 1185 return "K210AddPad"; 1186 case KL_K210_REMOVE_PADDING: 1187 return "K210RemovePad"; 1188 case KL_K210_UPLOAD: 1189 return "K210Upload"; 1190 default: 1191 return "Unknown"; 1192 } 1660 1193 } 1661 1194 #endif … … 1663 1196 static int kpu_kmodel_done(kpu_model_context_t *ctx) 1664 1197 { 1665 1666 1667 1668 1669 1670 1671 1672 1198 kpu->interrupt_clear.data = (kpu_config_interrupt_t){ 1199 .calc_done_int = 1, 1200 .layer_cfg_almost_empty_int = 1, 1201 .layer_cfg_almost_full_int = 1}; 1202 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1203 .calc_done_int = 1, 1204 .layer_cfg_almost_empty_int = 1, 1205 .layer_cfg_almost_full_int = 1}; 1673 1206 #if KPU_DEBUG 1674 uint32_t cnt_layer_id = ctx->current_layer - 1;1675 1676 if(last_time != 0)1677 1678 1679 syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);1680 1681 if(last_layer_type == KL_K210_CONV)1682 1683 1684 1685 syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);1686 syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);1687 syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);1207 uint32_t cnt_layer_id = ctx->current_layer; 1208 uint64_t time = sysctl_get_time_us(); 1209 if (last_time != 0) 1210 { 1211 uint64_t layer_time = time - last_time; 1212 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000); 1213 total_time += layer_time; 1214 if (last_layer_type == KL_K210_CONV) 1215 kpu_time += layer_time; 1216 } 1217 1218 syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000); 1219 syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000); 1220 syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000); 1688 1221 #endif 1689 1690 1222 ctx->done_callback(ctx->userdata); 1223 return 0; 1691 1224 } 1692 1225 1693 1226 static int ai_step(void *userdata) 1694 1227 { 1695 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata; 1696 1697 uint32_t cnt_layer_id = ctx->current_layer; 1698 const uint8_t *layer_body = ctx->current_body; 1699 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id; 1700 if (cnt_layer_id >= ctx->layers_length) { 1701 //syslog(LOG_NOTICE, "overrun"); 1702 kpu_kmodel_done(ctx); 1703 return -1; 1704 } 1705 1706 ctx->current_layer++; 1707 ctx->current_body += cnt_layer_header->body_size; 1228 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata; 1229 1230 uint32_t cnt_layer_id = ctx->current_layer; 1231 const uint8_t *layer_body = ctx->current_body; 1232 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id; 1233 if (cnt_layer_id >= ctx->layers_length) 1234 { 1235 //syslog(LOG_NOTICE, "overrun"); 1236 kpu_kmodel_done(ctx); 1237 return -1; 1238 } 1239 1240 ctx->current_layer++; 1241 ctx->current_body += cnt_layer_header->body_size; 1708 1242 1709 1243 #if KPU_DEBUG 1710 1711 if(last_time != 0)1712 1713 1714 1715 1716 if(last_layer_type == KL_K210_CONV)1717 1718 1719 1720 1721 1244 uint64_t time = sysctl_get_time_us(); 1245 if (last_time != 0) 1246 { 1247 uint64_t layer_time = time - last_time; 1248 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000); 1249 total_time += layer_time; 1250 if (last_layer_type == KL_K210_CONV) 1251 kpu_time += layer_time; 1252 } 1253 1254 last_layer_type = cnt_layer_header->type; 1255 last_time = sysctl_get_time_us(); 1722 1256 #endif 1723 1257 1724 switch(cnt_layer_header->type)1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 if (ctx->current_layer < ctx->layers_length)1797 1798 1799 1800 1258 switch (cnt_layer_header->type) 1259 { 1260 case KL_ADD: 1261 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx); 1262 break; 1263 case KL_QUANTIZED_ADD: 1264 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx); 1265 break; 1266 case KL_GLOBAL_AVERAGE_POOL2D: 1267 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx); 1268 break; 1269 case KL_QUANTIZED_MAX_POOL2D: 1270 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx); 1271 break; 1272 case KL_AVERAGE_POOL2D: 1273 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx); 1274 break; 1275 case KL_QUANTIZE: 1276 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx); 1277 break; 1278 case KL_DEQUANTIZE: 1279 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx); 1280 break; 1281 case KL_REQUANTIZE: 1282 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx); 1283 break; 1284 case KL_L2_NORMALIZATION: 1285 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx); 1286 break; 1287 case KL_SOFTMAX: 1288 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx); 1289 break; 1290 case KL_CONCAT: 1291 case KL_QUANTIZED_CONCAT: 1292 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx); 1293 break; 1294 case KL_FULLY_CONNECTED: 1295 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx); 1296 break; 1297 case KL_TENSORFLOW_FLATTEN: 1298 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx); 1299 break; 1300 case KL_RESIZE_NEAREST_NEIGHBOR: 1301 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx); 1302 break; 1303 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: 1304 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx); 1305 break; 1306 case KL_CHANNELWISE_DEQUANTIZE: 1307 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx); 1308 break; 1309 case KL_LOGISTIC: 1310 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx); 1311 break; 1312 case KL_K210_CONV: 1313 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx); 1314 return 0; 1315 case KL_K210_ADD_PADDING: 1316 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx); 1317 break; 1318 case KL_K210_REMOVE_PADDING: 1319 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx); 1320 break; 1321 case KL_K210_UPLOAD: 1322 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx); 1323 break; 1324 default: 1325 assert(!"Layer is not supported."); 1326 kpu_kmodel_done(ctx); 1327 return -1; 1328 } 1329 1330 if (ctx->current_layer < (ctx->layers_length - 1)) 1331 ai_step(userdata); 1332 else 1333 kpu_kmodel_done(ctx); 1334 return 0; 1801 1335 } 1802 1336 1803 1337 static void ai_step_not_isr(void *userdata) 1804 1338 { 1805 sysctl_disable_irq(); 1806 ai_step(userdata); 1807 sysctl_enable_irq(); 1339 dis_int(INTNO_DMAAI); 1340 dis_int(INTNO_AI); 1341 1342 ai_step(userdata); 1343 1344 ena_int(INTNO_DMAAI); 1345 ena_int(INTNO_AI); 1808 1346 } 1809 1347 1810 1348 int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata) 1811 1349 { 1812 if(ctx->is_nncase) 1813 return -1; 1814 1815 ctx->dma_ch = dma_ch; 1816 ctx->done_callback = done_callback; 1817 ctx->userdata = userdata; 1818 ctx->current_layer = 0; 1819 ctx->current_body = ctx->body_start; 1350 ctx->dma_ch = dma_ch; 1351 ctx->done_callback = done_callback; 1352 ctx->userdata = userdata; 1353 ctx->current_layer = 0; 1354 ctx->current_body = ctx->body_start; 1820 1355 #if KPU_DEBUG 1821 1822 1823 1356 last_time = 0; 1357 total_time = 0; 1358 kpu_time = 0; 1824 1359 #endif 1825 1360 1826 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer; 1827 kpu->interrupt_clear.reg = 7; 1828 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){ 1829 .fifo_full_threshold = 10, .fifo_empty_threshold = 1}; 1830 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){ 1831 .eight_bit_mode = header->flags & 1}; 1832 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1833 .calc_done_int = 1, 1834 .layer_cfg_almost_empty_int = 0, 1835 .layer_cfg_almost_full_int = 1}; 1836 1837 plic_set_priority(INTNO_AI, 1); 1838 plic_irq_register(INTNO_AI, ai_step, ctx); 1839 plic_irq_enable(INTNO_AI); 1840 1841 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers; 1842 1843 switch(first_layer_header->type) 1844 { 1845 case KL_K210_CONV: 1846 { 1847 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start; 1848 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset); 1849 1850 if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0) 1851 { 1852 kpu_kmodel_input_with_padding(&layer_arg, src); 1853 ai_step_not_isr(ctx); 1854 } else 1855 { 1856 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx); 1857 } 1858 } 1859 break; 1860 case KL_FULLY_CONNECTED: 1861 { 1862 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start; 1863 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels); 1864 ai_step_not_isr(ctx); 1865 } 1866 break; 1867 default: 1868 return -1; 1869 } 1870 1871 return 0; 1872 } 1361 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer; 1362 kpu->interrupt_clear.reg = 7; 1363 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){ 1364 .fifo_full_threshold = 10, .fifo_empty_threshold = 1}; 1365 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){ 1366 .eight_bit_mode = header->flags & 1}; 1367 kpu->interrupt_mask.data = (kpu_config_interrupt_t){ 1368 .calc_done_int = 1, 1369 .layer_cfg_almost_empty_int = 0, 1370 .layer_cfg_almost_full_int = 1}; 1371 1372 //plic_set_priority(INTNO_AI, 1); 1373 plic_irq_register(INTNO_AI, ai_step, ctx); 1374 plic_irq_enable(INTNO_AI); 1375 1376 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers; 1377 1378 switch (first_layer_header->type) 1379 { 1380 case KL_K210_CONV: 1381 { 1382 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start; 1383 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset); 1384 1385 if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0) 1386 { 1387 kpu_kmodel_input_with_padding(&layer_arg, src); 1388 ai_step_not_isr(ctx); 1389 } 1390 else 1391 { 1392 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx); 1393 } 1394 } 1395 break; 1396 case KL_FULLY_CONNECTED: 1397 { 1398 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start; 1399 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels); 1400 ai_step_not_isr(ctx); 1401 } 1402 break; 1403 default: 1404 return -1; 1405 } 1406 1407 return 0; 1408 } 1409 1410 ER kpu_init(kpu_model_context_t *ctx) 1411 { 1412 g_ai_hdma.chnum = AI_DMA_CH; 1413 g_ai_hdma.xfercallback = ai_dma_done_isr; 1414 g_ai_hdma.errorcallback = NULL; 1415 g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ; /* DMA選択 */ 1416 g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY; /* DMA転送方向 */ 1417 g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT; /* ソースマルチブロックタイプ */ 1418 g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT; /* デスティネーションマルチブロックタイプ */ 1419 g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE; /* ソースハンドシェイク */ 1420 g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE; /* デスティネーションハンドシェイク */ 1421 g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* ソースハードウェアハンドシェイク極性 */ 1422 g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* デスティネーションハードウェアハンドシェイク極性 */ 1423 g_ai_hdma.Init.Priority = 4; /* 優先度 */ 1424 g_ai_hdma.Init.SrcMaster = DMAC_MASTER1; /* ソースマスター設定 */ 1425 g_ai_hdma.Init.DstMaster = DMAC_MASTER2; /* デスティネーションマスター設定 */ 1426 g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE; /* ソースインクリメント設定 */ 1427 g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT; /* デスティネーションインクリメント設定 */ 1428 g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* ソース転送幅 */ 1429 g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* デスティネーション転送幅 */ 1430 g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4; /* ソースバーストサイズ */ 1431 g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4; /* デスティネーションバーストサイズ */ 1432 g_ai_hdma.Init.IocBlkTrans = 0; /* IOCブロック転送 */ 1433 g_ai_hdma.localdata = (void *)ctx; 1434 1435 return dma_init(&g_ai_hdma); 1436 }
Note:
See TracChangeset
for help on using the changeset viewer.