source: azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c@453

Last change on this file since 453 was 453, checked in by coas-nagasima, 4 years ago

Add files

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 64.2 KB
1#include <assert.h>
2#include <float.h>
3#include <math.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7#include <stdint.h>
8#include <kernel.h>
9#include <t_syslog.h>
10#include <t_stdlib.h>
11#include <kernel_impl.h>
12#include <target_syssvc.h>
13#include "kendryte-k210.h"
14#include "device.h"
15#include "atomic.h"
16#include "kpu.h"
17#include "utils.h"
18#include "kpu_main.h"
19
20#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
21
22void sysctl_enable_irq(void)
23{
24 set_csr(mie, MIP_MEIP);
25 set_csr(mstatus, MSTATUS_MIE);
26}
27
28void sysctl_disable_irq(void)
29{
30 clear_csr(mie, MIP_MEIP);
31 clear_csr(mstatus, MSTATUS_MIE);
32}
33
34uint64_t sysctl_get_time_us(void)
35{
36 uint64_t v_cycle = read_cycle();
37 return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
38}
39
40static int is_memory(uintptr_t address)
41{
42 enum
43 {
44 mem_len = 6 * 1024 * 1024,
45 mem_no_cache_len = 8 * 1024 * 1024,
46 };
47 return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
48}
49
50uint32_t is_memory_cache(uintptr_t address)
51{
52 #define MEM_CACHE_LEN (6 * 1024 * 1024)
53
54 return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
55}
56
57int plic_irq_enable(INTNO irq_number)
58{
59 if (irq_number != INTNO_AI)
60 return -1;
61 ena_int(irq_number);
62 return 0;
63}
64
65int plic_set_priority(INTNO irq_number, uint32_t priority)
66{
67 if (irq_number != INTNO_AI)
68 return -1;
69 set_ipriority(irq_number, priority);
70 return 0;
71}
72
73plic_irq_callback_t ai_done_callback;
74void *ai_done_ctx;
75
76void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
77{
78 ER ret;
79 if (irq != INTNO_AI)
80 return;
81
82 ret = loc_cpu();
83
84 ai_done_callback = callback;
85 ai_done_ctx = ctx;
86
87 if (ret == E_OK)
88 unl_cpu();
89}
90
91void ai_done_isr(intptr_t exinf)
92{
93 sysctl_disable_irq();
94 if (ai_done_callback != NULL){
95 ai_done_callback(ai_done_ctx);
96 }
97 sysctl_enable_irq();
98}
99
100plic_irq_callback_t ai_dma_done_callback;
101void *ai_dma_done_ctx;
102
103void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
104 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
105{
106 ER ret;
107 if (channel_num != AI_DMA_CH)
108 return;
109
110 set_ipriority(INTNO_DMAAI, priority);
111
112 ret = loc_cpu();
113
114 ai_dma_done_callback = dmac_callback;
115 ai_dma_done_ctx = ctx;
116
117 if (ret == E_OK)
118 unl_cpu();
119}
120
121void ai_dma_done_isr(DMA_Handle_t *dma)
122{
123 sysctl_disable_irq();
124 if (ai_dma_done_callback != NULL) {
125 ai_dma_done_callback(ai_dma_done_ctx);
126 }
127 sysctl_enable_irq();
128}
129
130void dmac_set_irq(dmac_channel_number_t channel_num,
131 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
132{
133 ER ret;
134 if (channel_num != AI_DMA_CH)
135 return;
136
137 set_ipriority(INTNO_DMAAI, priority);
138
139 ret = loc_cpu();
140
141 ai_dma_done_callback = dmac_callback;
142 ai_dma_done_ctx = ctx;
143
144 if (ret == E_OK)
145 unl_cpu();
146}
147
148DMA_Handle_t g_ai_hdma;
149
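/*
 * Configure the AI DMA channel for a single block transfer. The transfer
 * direction is derived from whether src/dest fall inside a known memory
 * region (is_memory()); handshaking is software for memory endpoints and
 * hardware for peripheral endpoints.
 */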
150void dmac_set_single_mode(dmac_channel_number_t channel_num,
151 const void *src, void *dest, uint8_t src_inc,
152 uint8_t dest_inc,
153 uint8_t dmac_burst_size,
154 uint8_t dmac_trans_width,
155 size_t block_size)
156{
157 if (channel_num != AI_DMA_CH)
158 return;
159
160 DMA_Handle_t *hdma = &g_ai_hdma;
161 int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
162 uint8_t flow_control;
163 if(mem_type_src == 0 && mem_type_dest == 0)
164 flow_control = DMA_PERIPH_TO_PERIPH;
165 else if(mem_type_src == 1 && mem_type_dest == 0)
166 flow_control = DMA_MEMORY_TO_PERIPH;
167 else if(mem_type_src == 0 && mem_type_dest == 1)
168 flow_control = DMA_PERIPH_TO_MEMORY;
169 else
170 flow_control = DMA_MEMORY_TO_MEMORY;
171
172 hdma->Init.Direction = flow_control; /* DMA transfer direction */
173 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* source handshake */
174 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
175 hdma->Init.SrcInc = src_inc; /* source increment setting */
176 hdma->Init.DstInc = dest_inc; /* destination increment setting */
177 hdma->Init.SrcTransWidth = dmac_trans_width; /* source transfer width */
178 hdma->Init.DstTransWidth = dmac_trans_width; /* destination transfer width */
179 hdma->Init.SrcBurstSize = dmac_burst_size; /* source burst size */
180 hdma->Init.DstBurstSize = dmac_burst_size; /* destination burst size */
181 dma_reset(hdma);
182
183 dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
184}
185
186#define LAYER_BURST_SIZE 12
187
188#define KPU_DEBUG 0
189#define USE_CACHED_AI_RAM 0
190
191#define min(a, b) (((a) < (b)) ? (a) : (b))
192#define max(a, b) (((a) > (b)) ? (a) : (b))
193#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))
194
195static int ai_step(void *userdata);
196static int kpu_kmodel_done(kpu_model_context_t *ctx);
197
198volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
199static volatile uint32_t kpu_status;
200
201typedef struct kpu_context
202{
203 kpu_task_t kpu_task;
204 uint32_t kpu_status;
205} kpu_context_t;
206
207volatile kpu_context_t g_kpu_context;
208
209static int kpu_run_all_done(void *_task)
210{
211 atomic_swap(&g_kpu_context.kpu_status, 0);
212 kpu_task_t *task = (kpu_task_t *)_task;
213 task->callback(task);
214 return 0;
215}
216
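/*
 * Push the next batch of layer descriptors into the KPU layer-argument FIFO.
 * Each layer is written as 12 argument words; the burst size here is 1, so
 * after the initial push the AI interrupt (kpu_continue is registered on
 * INTNO_AI in kpu_run) re-invokes this function to queue one more layer
 * until remain_layers_length reaches zero.
 */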
217int kpu_continue(void *_task)
218{
219 kpu_task_t *task = (kpu_task_t *)_task;
220 int layer_burst_size = 1;
221
222 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
223 .calc_done_int = 1,
224 .layer_cfg_almost_empty_int = 1,
225 .layer_cfg_almost_full_int = 1};
226
227 if(task->remain_layers_length == 0)
228 {
229 return 0;
230 }
231 if(task->remain_layers_length <= layer_burst_size)
232 {
233 for(uint32_t i = 0; i < task->remain_layers_length; i++)
234 {
235 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
236 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
237 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
238 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
239 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
240 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
241 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
242 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
243 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
244 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
245 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
246 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
247 }
248 task->remain_layers_length = 0;
249 } else
250 {
251 for(uint32_t i = 0; i < layer_burst_size; i++)
252 {
253 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
254 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
255 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
256 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
257 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
258 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
259 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
260 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
261 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
262 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
263 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
264 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
265 }
266 task->remain_layers += layer_burst_size;
267 task->remain_layers_length -= layer_burst_size;
268 }
269 return 0;
270}
271
272static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
273{
274 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
275 kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
276 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
277 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
278 return 0;
279}
280
281static int kpu_run_dma_input_done_push_layers(void *_task)
282{
283 kpu_task_t *task = (kpu_task_t *)_task;
284 kpu->interrupt_clear.reg = 7;
285 dma_end(&g_ai_hdma);
286 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
287 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
288 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
289 .eight_bit_mode = task->eight_bit_mode};
290
291 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
292
293 kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
294
295 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
296 .calc_done_int = 0,
297 .layer_cfg_almost_empty_int = 0,
298 .layer_cfg_almost_full_int = 1};
299 kpu_continue(task);
300 return 0;
301}
302
303static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
304{
305 kpu_task_t *task = _task;
306 kpu_layer_argument_t *first_layer = &task->layers[0];
307 uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
308 kpu_dmac_irq_register(dma_ch, cb, _task, 1);
309 dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
310 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
311}
312
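/*
 * kpu_task_t-based entry point: copies the task into the global context,
 * marks the last layer to send its result out through the data FIFO and to
 * raise an interrupt, registers kpu_continue on the AI interrupt, and starts
 * the input DMA. When the input transfer completes,
 * kpu_run_dma_input_done_push_layers sets up the output DMA and begins
 * feeding layer arguments to the KPU.
 */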
313int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
314{
315 if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
316 return -1;
317
318 memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
319 kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
320
321 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
322
323 uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
324
325 last_layer->dma_parameter.data.send_data_out = 1;
326 last_layer->interrupt_enabe.data.int_en = 1;
327
328 task->dma_ch = dma_ch;
329 task->dst = dest;
330 task->dst_length = output_size;
331 task->callback = callback;
332 task->remain_layers_length = task->layers_length;
333 task->remain_layers = task->layers;
334
335 plic_set_priority(INTNO_AI, 1);
336 plic_irq_register(INTNO_AI, kpu_continue, task);
337 plic_irq_enable(INTNO_AI);
338
339 kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);
340
341 return 0;
342}
343
344uint8_t *kpu_get_output_buf(kpu_task_t *task)
345{
346 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
347 size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
348 return malloc(output_size);
349}
350
351void kpu_release_output_buf(uint8_t *output_buf)
352{
353 if(output_buf != NULL)
354 free(output_buf);
355}
356
357static int kpu_done(void *ctx)
358{
359 atomic_swap(&kpu_status, 0);
360 kpu_task_t *task = (kpu_task_t *)ctx;
361 task->callback(task->ctx);
362 return 0;
363}
364
365static int kpu_config_input(void *ctx)
366{
367 kpu_task_t *task = (kpu_task_t *)ctx;
368 kpu->interrupt_clear.reg = 7;
369 if(task->remain_layers_length <= LAYER_BURST_SIZE)
370 {
371 for(uint32_t i = 0; i < task->remain_layers_length; i++)
372 {
373 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
374 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
375 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
376 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
377 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
378 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
379 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
380 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
381 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
382 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
383 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
384 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
385 }
386 task->remain_layers_length = 0;
387 kpu->interrupt_mask.reg = 7;
388 } else
389 {
390 for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
391 {
392 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
393 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
394 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
395 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
396 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
397 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
398 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
399 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
400 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
401 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
402 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
403 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
404 }
405 task->remain_layers += LAYER_BURST_SIZE;
406 task->remain_layers_length -= LAYER_BURST_SIZE;
407 }
408 return 0;
409}
410
411static void kpu_data_output(kpu_task_t *task)
412{
413 select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
414 kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
415 dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
416 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
417}
418
419static int kpu_data_ready(void *ctx)
420{
421 kpu_task_t *task = (kpu_task_t *)ctx;
422
423 dma_end(&g_ai_hdma);
424 kpu_data_output(task);
425
426 kpu->eight_bit_mode.reg = task->eight_bit_mode;
427 kpu->interrupt_mask.reg = 7;
428 kpu->interrupt_clear.reg = 7;
429 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
430 .fifo_full_threshold = 12, .fifo_empty_threshold = 1};
431
432 plic_set_priority(INTNO_AI, 2);
433 plic_irq_register(INTNO_AI, kpu_config_input, task);
434 plic_irq_enable(INTNO_AI);
435 kpu_config_input(task);
436 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
437 .calc_done_int = 1,
438 .layer_cfg_almost_empty_int = 0,
439 .layer_cfg_almost_full_int = 1};
440 return 0;
441}
442
443static void kpu_data_input(kpu_task_t *task)
444{
445 if(task->src == NULL)
446 {
447 kpu_data_ready(task);
448 return;
449 }
450 kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
451 kpu_layer_argument_t *layer = &task->layers[0];
452 dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
453 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
454}
455
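/*
 * One-shot task setup: enables the AI peripheral clock, flags the last layer
 * for interrupt + data-out, and derives src_length/dst_length in 64-bit
 * units (hence the divide by 8) before allocating the output buffer.
 */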
456int kpu_single_task_init(kpu_task_t *task)
457{
458 /*
459 * Enable the AI clock
460 */
461 sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);
462
463 kpu_layer_argument_t *first_layer = &task->layers[0];
464 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
465
466 last_layer->dma_parameter.data.send_data_out = 1;
467 last_layer->interrupt_enabe.data.int_en = 1;
468 task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
469 task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
470 task->dst = (uint64_t *)malloc(task->dst_length * 8);
471 if(task->dst == NULL)
472 return 1;
473 memset(task->dst, 0, task->dst_length * 8);
474 return 0;
475}
476
477int kpu_single_task_deinit(kpu_task_t *task)
478{
479 free(task->dst);
480 return 0;
481}
482
483int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
484{
485 uintptr_t base_addr = (uintptr_t)buffer;
486 kpu_model_header_t *header = (kpu_model_header_t *)buffer;
487 kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
488 kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
489
490 if(header->version != 1)
491 return -1;
492 uint32_t layers_length = header->layers_length;
493 task->layers_length = layers_length;
494 task->eight_bit_mode = header->flags & 1;
495 task->layers = layers;
496 task->output_scale = layer_meta[layers_length - 1].output_scale;
497 task->output_bias = layer_meta[layers_length - 1].output_bias;
498 size_t i;
499 for(i = 0; i < layers_length; i++)
500 {
501 layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
502 layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
503 layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
504 }
505
506 if(meta)
507 *meta = layer_meta;
508 return 0;
509}
510
511int kpu_start(kpu_task_t *task)
512{
513 if(atomic_cas(&kpu_status, 0, 1))
514 return -1;
515
516 task->remain_layers_length = task->layers_length;
517 task->remain_layers = task->layers;
518 kpu_data_input(task);
519 return 0;
520}
521
522static void kpu_send_layer(const kpu_layer_argument_t *layer)
523{
524 kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
525 kpu->layer_argument_fifo = layer->image_addr.reg;
526 kpu->layer_argument_fifo = layer->image_channel_num.reg;
527 kpu->layer_argument_fifo = layer->image_size.reg;
528 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
529 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
530 kpu->layer_argument_fifo = layer->kernel_offset.reg;
531 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
532 kpu->layer_argument_fifo = layer->write_back_cfg.reg;
533 kpu->layer_argument_fifo = layer->conv_value.reg;
534 kpu->layer_argument_fifo = layer->conv_value2.reg;
535 kpu->layer_argument_fifo = layer->dma_parameter.reg;
536}
537
538void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
539{
540 kpu->interrupt_clear.reg = 7;
541 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
542 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
543 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
544 .eight_bit_mode = eight_bit_mode};
545 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
546 .calc_done_int = 1,
547 .layer_cfg_almost_empty_int = 0,
548 .layer_cfg_almost_full_int = 1};
549
550 plic_set_priority(INTNO_AI, 1);
551 plic_irq_register(INTNO_AI, callback, userdata);
552 plic_irq_enable(INTNO_AI);
553}
554
555void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
556{
557 uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
558 dmac_set_irq(dma_ch, callback, userdata, 1);
559 dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
560 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
561}
562
563static void kpu_conv2d_core(kpu_layer_argument_t *layer)
564{
565 kpu_send_layer(layer);
566}
567
568void kpu_conv2d(kpu_layer_argument_t *layer)
569{
570 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
571 .calc_done_int = 1,
572 .layer_cfg_almost_empty_int = 1,
573 .layer_cfg_almost_full_int = 1};
574 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
575 .calc_done_int = 1,
576 .layer_cfg_almost_empty_int = 0,
577 .layer_cfg_almost_full_int = 1};
578 kpu_conv2d_core(layer);
579}
580
581void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
582{
583 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
584 .calc_done_int = 1,
585 .layer_cfg_almost_empty_int = 1,
586 .layer_cfg_almost_full_int = 1};
587 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
588 .calc_done_int = 1,
589 .layer_cfg_almost_empty_int = 1,
590 .layer_cfg_almost_full_int = 1};
591 layer->dma_parameter.data.send_data_out = 1;
592 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
593 dmac_set_irq(dma_ch, callback, userdata, 1);
594 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
595 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
596 kpu_conv2d_core(layer);
597}
598
599void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
600{
601 uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
602 layer->interrupt_enabe.data.full_add = 1;
603
604 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
605 .calc_done_int = 1,
606 .layer_cfg_almost_empty_int = 1,
607 .layer_cfg_almost_full_int = 1};
608 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
609 .calc_done_int = 1,
610 .layer_cfg_almost_empty_int = 1,
611 .layer_cfg_almost_full_int = 1};
612 layer->dma_parameter.data.send_data_out = 1;
613 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
614 dmac_set_irq(dma_ch, callback, userdata, 1);
615 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
616 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
617 kpu_conv2d_core(layer);
618}
619
620void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
621{
622 quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
623
624 size_t i;
625 for(i = 0; i < count; i++)
626 {
627 int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
628 if(value < 0)
629 value = 0;
630 if(value > 0xFF)
631 value = 0xFF;
632 *dest++ = value;
633 }
634}
635
636void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
637{
638 quantize_param_t q1 = *src_param, q2 = *dest_param;
639 size_t oc, y, x;
640
641 if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
642 {
643 uint32_t row_padding = 16;
644 uint32_t row_group = 4;
645 uint32_t row_length = 1;
646 uint32_t height = 4;
647
648 for(oc = 0; oc < channels; oc++)
649 {
650 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
651 for(y = 0; y < 1; y++)
652 {
653 uint8_t *y_origin = channel_origin + y * row_length * 64;
654 for(x = 0; x < 1; x++)
655 {
656 int64_t sum = 0;
657 size_t i;
658 for(i = 0; i < kernel_size; i++)
659 sum += *src++;
660
661 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
662 if(value < 0)
663 value = 0;
664 if(value > 0xFF)
665 value = 0xFF;
666 y_origin[x] = value;
667 }
668 }
669 }
670 } else
671 {
672 for(oc = 0; oc < channels; oc++)
673 {
674 int64_t sum = 0;
675 size_t i;
676 for(i = 0; i < kernel_size; i++)
677 sum += *src++;
678
679 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
680 if(value < 0)
681 value = 0;
682 if(value > 0xFF)
683 value = 0xFF;
684 dest[oc] = value;
685 }
686 }
687}
688
689void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
690{
691 quantize_param_t q = *src_param;
692 size_t oc;
693
694 for(oc = 0; oc < channels; oc++)
695 {
696 int64_t sum = 0;
697 size_t i;
698 for(i = 0; i < kernel_size; i++)
699 sum += *src++;
700
701 float value = (sum * q.scale + q.bias) / kernel_size;
702 dest[oc] = value;
703 }
704}
705
706void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
707{
708 quantize_param_t q1 = *dest_param;
709 size_t i = 0;
710 for(i = 0; i < channels; i++)
711 *dest++ = src[i * 16] * q1.scale + q1.bias;
712}
713
714void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
715{
716 int ic, oc;
717 for(oc = 0; oc < output_channels; oc++)
718 {
719 const float *c_weights = weights + oc * input_channels;
720
721 float sum = 0.0f;
722 for(ic = 0; ic < input_channels; ic++)
723 sum += src[ic] * c_weights[ic];
724 dest[oc] = sum + biases[oc];
725 }
726}
727
728void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
729{
730 quantize_param_t q1 = *src_param;
731 size_t i = 0;
732 for(i = 0; i < count; i++)
733 *dest++ = src[i] * q1.scale + q1.bias;
734}
735
736void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
737{
738 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
739 size_t oc, y, x;
740
741 uint32_t row_padding;
742 uint32_t row_group;
743 uint32_t row_length;
744
745 if(width <= 16)
746 {
747 row_padding = 16;
748 row_group = 4;
749 row_length = 1;
750 } else if(width <= 32)
751 {
752 row_padding = 32;
753 row_group = 2;
754 row_length = 1;
755 } else
756 {
757 row_padding = 64;
758 row_group = 1;
759 row_length = (width + 63) / 64;
760 }
761
762 for(oc = 0; oc < channels; oc++)
763 {
764 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
765 for(y = 0; y < height; y++)
766 {
767 uint8_t *y_origin = channel_origin + y * row_length * 64;
768 for(x = 0; x < width; x++)
769 y_origin[x] = *src++;
770 }
771 }
772}
773#if USE_CACHED_AI_RAM
774static void kpu_flush_cache(uint32_t addr, size_t lines)
775{
776 size_t line;
777 for(line = 0; line < lines; line++)
778 {
779 const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
780 uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
781 size_t i;
782 for(i = 0; i < 8; i++)
783 dest[i] = src[i];
784 }
785}
786#endif
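/*
 * Right-shift helper used when rescaling quantized sums: the value is first
 * shifted by (shift - 1), and if the remaining low bit is set the final
 * result is adjusted by one away from zero instead of being truncated.
 */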
787static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
788{
789 if(shift > 0)
790 {
791 value >>= shift - 1;
792 if(value & 0x1)
793 {
794 if(value < 0)
795 value = (value >> 1) - 1;
796 else
797 value = (value >> 1) + 1;
798 } else
799 {
800 value >>= 1;
801 }
802 }
803
804 return value;
805}
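/*
 * Copy an image into KPU I/O RAM using the hardware row layout: rows are
 * padded to 64 bytes, narrow images (width <= 16 or <= 32) pack 4 or 2
 * channels per 64-byte row group, and 8-byte-aligned sources with a width
 * that is a multiple of 8 take an unrolled 64-bit copy path.
 */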
806static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
807{
808 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
809 size_t oc, y, x;
810 uint32_t row_padding;
811 uint32_t row_group;
812 uint32_t row_length;
813 if(width <= 16)
814 {
815 row_padding = 16;
816 row_group = 4;
817 row_length = 1;
818 } else if(width <= 32)
819 {
820 row_padding = 32;
821 row_group = 2;
822 row_length = 1;
823 } else
824 {
825 row_padding = 64;
826 row_group = 1;
827 row_length = (width + 63) / 64;
828 }
829
830 if((uintptr_t)src % 8 == 0 && width % 8 == 0)
831 {
832#define UPLOAD_BEGIN() \
833 for(oc = 0; oc < channels; oc++) \
834 { \
835 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
836 for(y = 0; y < height; y++) \
837 { \
838 uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
839
840#define UPLOAD_END() \
841 } \
842 }
843
844 width /= 8;
845 const uint64_t *u64_src = (const uint64_t *)src;
846 if(width == 1)
847 {
848 UPLOAD_BEGIN()
849 y_origin[0] = *u64_src++;
850 UPLOAD_END()
851 } else if(width == 2)
852 {
853 UPLOAD_BEGIN()
854 {
855 y_origin[0] = *u64_src++;
856 y_origin[1] = *u64_src++;
857 }
858 UPLOAD_END()
859 } else if(width == 4)
860 {
861 UPLOAD_BEGIN()
862 {
863 y_origin[0] = *u64_src++;
864 y_origin[1] = *u64_src++;
865 y_origin[2] = *u64_src++;
866 y_origin[3] = *u64_src++;
867 }
868 UPLOAD_END()
869 } else
870 {
871 UPLOAD_BEGIN()
872 for(x = 0; x < width; x++)
873 y_origin[x] = *u64_src++;
874 UPLOAD_END()
875 }
876 } else
877 {
878 for(oc = 0; oc < channels; oc++)
879 {
880 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
881 for(y = 0; y < height; y++)
882 {
883 uint8_t *y_origin = channel_origin + y * row_length * 64;
884 for(x = 0; x < width; x++)
885 y_origin[x] = *src++;
886 }
887 }
888 }
889}
890static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
891{
892 size_t width = layer->image_size.data.i_row_wid + 1;
893 size_t height = layer->image_size.data.i_col_high + 1;
894 size_t channels = layer->image_channel_num.data.i_ch_num + 1;
895
896 kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
897}
898
899static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
900{
901 memcpy(dest, src, count * sizeof(float));
902}
903
904static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
905{
906 size_t i;
907
908 if(act == KLA_RELU)
909 {
910 for(i = 0; i < count; i++)
911 data[i] = max(data[i], 0);
912 } else if(act == KLA_RELU6)
913 {
914 for(i = 0; i < count; i++)
915 data[i] = min(max(data[i], 0), 6);
916 }
917}
918
919static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
920{
921 const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
922 const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
923 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
924 size_t i, count = arg->count;
925
926 for(i = 0; i < count; i++)
927 dest[i] = src_a[i] + src_b[i];
928}
929
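/*
 * Element-wise add of two quantized (uint8) tensors: each input is offset,
 * multiplied and shifted to a common scale, summed, rescaled with
 * kpu_carry_shift, offset for the output and clamped to 0..255. The body is
 * manually unrolled 8 elements per iteration (count is rounded up to a
 * multiple of 8).
 */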
930static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
931{
932 const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
933 const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
934 size_t count = ALIGN_UP(arg->count, 8) / 8;
935 int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
936 int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
937 int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
938
939 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
940 size_t i;
941
942 if(sh_a == sh_b)
943 {
944#define QADD_UNROLL_1(x) \
945 int64_t a##x = *src_a++; \
946 int64_t b##x = *src_b++;
947
948#define QADD_UNROLL_2(x) \
949 a##x += off_a; \
950 b##x += off_b;
951
952#define QADD_UNROLL_3(x) \
953 a##x *= mul_a; \
954 b##x *= mul_b;
955
956#define QADD_UNROLL_4(x) \
957 int64_t v##x = a##x + b##x;
958
959#define QADD_UNROLL_5(x) \
960 v##x >>= sh_a;
961
962#define QADD_UNROLL_6(x) \
963 v##x *= mul_o;
964
965#define QADD_UNROLL_7(x) \
966 v##x = kpu_carry_shift(v##x, sh_o);
967
968#define QADD_UNROLL_8(x) \
969 v##x += off_o;
970
971#define QADD_UNROLL_9(x) \
972 v##x = min(0xFF, max(0, v##x));
973
974#define QADD_UNROLL_10(x) \
975 *dest++ = v##x;
976
977#define QADD_UNROLL_S(x) \
978 QADD_UNROLL_##x(0) \
979 QADD_UNROLL_##x(1) \
980 QADD_UNROLL_##x(2) \
981 QADD_UNROLL_##x(3) \
982 QADD_UNROLL_##x(4) \
983 QADD_UNROLL_##x(5) \
984 QADD_UNROLL_##x(6) \
985 QADD_UNROLL_##x(7)
986
987 for(i = 0; i < count; i++)
988 {
989 QADD_UNROLL_S(1);
990 QADD_UNROLL_S(2);
991 QADD_UNROLL_S(3);
992 QADD_UNROLL_S(4);
993 QADD_UNROLL_S(5);
994 QADD_UNROLL_S(6);
995 QADD_UNROLL_S(7);
996 QADD_UNROLL_S(8);
997 QADD_UNROLL_S(9);
998 QADD_UNROLL_S(10);
999 }
1000 } else
1001 {
1002#undef QADD_UNROLL_1
1003#define QADD_UNROLL_1(x) \
1004 int64_t a##x = *src_a++; \
1005 int64_t b##x = *src_b++;
1006
1007#undef QADD_UNROLL_2
1008#define QADD_UNROLL_2(x) \
1009 a##x += off_a; \
1010 b##x += off_b;
1011
1012#undef QADD_UNROLL_3
1013#define QADD_UNROLL_3(x) \
1014 a##x *= mul_a; \
1015 b##x *= mul_b;
1016
1017#undef QADD_UNROLL_4
1018#define QADD_UNROLL_4(x) \
1019 a##x >>= sh_a; \
1020 b##x >>= sh_b;
1021
1022#undef QADD_UNROLL_5
1023#define QADD_UNROLL_5(x) \
1024 int64_t v##x = a##x + b##x;
1025
1026#undef QADD_UNROLL_6
1027#define QADD_UNROLL_6(x) \
1028 v##x *= mul_o;
1029
1030#undef QADD_UNROLL_7
1031#define QADD_UNROLL_7(x) \
1032 v##x = kpu_carry_shift(v##x, sh_o);
1033
1034#undef QADD_UNROLL_8
1035#define QADD_UNROLL_8(x) \
1036 v##x += off_o;
1037
1038#undef QADD_UNROLL_9
1039#define QADD_UNROLL_9(x) \
1040 v##x = min(0xFF, max(0, v##x));
1041
1042#undef QADD_UNROLL_10
1043#define QADD_UNROLL_10(x) \
1044 *dest++ = v##x;
1045
1046#undef QADD_UNROLL_S
1047#define QADD_UNROLL_S(x) \
1048 QADD_UNROLL_##x(0) \
1049 QADD_UNROLL_##x(1) \
1050 QADD_UNROLL_##x(2) \
1051 QADD_UNROLL_##x(3) \
1052 QADD_UNROLL_##x(4) \
1053 QADD_UNROLL_##x(5) \
1054 QADD_UNROLL_##x(6) \
1055 QADD_UNROLL_##x(7)
1056
1057 for(i = 0; i < count; i++)
1058 {
1059 QADD_UNROLL_S(1);
1060 QADD_UNROLL_S(2);
1061 QADD_UNROLL_S(3);
1062 QADD_UNROLL_S(4);
1063 QADD_UNROLL_S(5);
1064 QADD_UNROLL_S(6);
1065 QADD_UNROLL_S(7);
1066 QADD_UNROLL_S(8);
1067 QADD_UNROLL_S(9);
1068 QADD_UNROLL_S(10);
1069 }
1070 }
1071}
1072
1073static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1074{
1075 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1076 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1077 size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
1078
1079 for(oc = 0; oc < channels; oc++)
1080 {
1081 float sum = 0.f;
1082 size_t i;
1083 for(i = 0; i < kernel_size; i++)
1084 sum += *src++;
1085
1086 dest[oc] = sum / kernel_size;
1087 }
1088}
1089
1090static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1091{
1092 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1093 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1094 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
1095 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
1096 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
1097 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
1098
1099 uint32_t out_y, out_x, oc;
1100
1101 for(oc = 0; oc < out_shape.channels; oc++)
1102 {
1103 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
1104 for(out_y = 0; out_y < out_shape.height; out_y++)
1105 {
1106 for(out_x = 0; out_x < out_shape.width; out_x++)
1107 {
1108 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
1109 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
1110 int32_t kernel_x_start = max(0, -in_x_origin);
1111 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
1112 int32_t kernel_y_start = max(0, -in_y_origin);
1113 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
1114 uint8_t value = 0;
1115
1116 int32_t kernel_y, kernel_x;
1117 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
1118 {
1119 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
1120 {
1121 int32_t in_x = in_x_origin + kernel_x;
1122 int32_t in_y = in_y_origin + kernel_y;
1123 value = max(value, channel_src[in_y * in_shape.width + in_x]);
1124 }
1125 }
1126
1127 *dest++ = value;
1128 }
1129 }
1130 }
1131}
1132
1133static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1134{
1135 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1136 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1137 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
1138 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
1139 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
1140 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
1141
1142 uint32_t out_y, out_x, oc;
1143
1144 for(oc = 0; oc < out_shape.channels; oc++)
1145 {
1146 const float *channel_src = src + in_shape.width * in_shape.height * oc;
1147 for(out_y = 0; out_y < out_shape.height; out_y++)
1148 {
1149 for(out_x = 0; out_x < out_shape.width; out_x++)
1150 {
1151 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
1152 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
1153 int32_t kernel_x_start = max(0, -in_x_origin);
1154 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
1155 int32_t kernel_y_start = max(0, -in_y_origin);
1156 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
1157 float value = 0;
1158 float kernel_count = 0;
1159
1160 int32_t kernel_y, kernel_x;
1161 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
1162 {
1163 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
1164 {
1165 int32_t in_x = in_x_origin + kernel_x;
1166 int32_t in_y = in_y_origin + kernel_y;
1167 value += channel_src[in_y * in_shape.width + in_x];
1168 kernel_count++;
1169 }
1170 }
1171
1172 *dest++ = value / kernel_count;
1173 }
1174 }
1175 }
1176}
1177
1178static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1179{
1180 size_t count = arg->count;
1181 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1182
1183 kpu_model_quant_param_t q = arg->quant_param;
1184
1185 float scale = 1.f / q.scale;
1186
1187 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
1188 size_t i;
1189 for(i = 0; i < count; i++)
1190 {
1191 int value = roundf((*src++ - q.bias) * scale);
1192 if(value < 0)
1193 value = 0;
1194 if(value > 0xFF)
1195 value = 0xFF;
1196 *dest++ = (uint8_t)value;
1197 }
1198}
1199
1200static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1201{
1202 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1203 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1204 size_t oc, count = arg->count;
1205 kpu_model_quant_param_t q = arg->quant_param;
1206
1207 for(oc = 0; oc < count; oc++)
1208 dest[oc] = *src++ * q.scale + q.bias;
1209}
1210
1211static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
1212{
1213 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1214 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1215 size_t oc, i, channels = arg->channels, count = arg->channel_size;
1216
1217 for(oc = 0; oc < channels; oc++)
1218 {
1219 const kpu_model_quant_param_t q = arg->quant_params[oc];
1220
1221 for(i = 0; i < count; i++)
1222 *dest++ = *src++ * q.scale + q.bias;
1223 }
1224}
1225
1226static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1227{
1228 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1229 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1230 size_t oc, count = arg->count;
1231 const uint8_t *table = arg->table;
1232
1233 if(false && count % 8 == 0)
1234 {
1235 for(oc = 0; oc < count;)
1236 {
1237 dest[oc++] = table[*src++];
1238 dest[oc++] = table[*src++];
1239 dest[oc++] = table[*src++];
1240 dest[oc++] = table[*src++];
1241 dest[oc++] = table[*src++];
1242 dest[oc++] = table[*src++];
1243 dest[oc++] = table[*src++];
1244 dest[oc++] = table[*src++];
1245 }
1246 } else
1247 {
1248 for(oc = 0; oc < count; oc++)
1249 dest[oc] = table[src[oc]];
1250 }
1251}
1252
1253static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
1254{
1255 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1256 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1257 size_t oc, channels = arg->channels;
1258
1259 float sum = 0.f;
1260 const float epsilon = 1e-10f;
1261 for(oc = 0; oc < channels; oc++)
1262 sum += src[oc] * src[oc];
1263 if(sum < epsilon)
1264 sum = epsilon;
1265 sum = 1.f / sqrtf(sum);
1266 for(oc = 0; oc < channels; oc++)
1267 dest[oc] = src[oc] * sum;
1268}
1269
1270static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
1271{
1272 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1273 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1274 size_t oc, channels = arg->channels;
1275
1276 float max = -FLT_MAX; /* FLT_MIN is the smallest positive float; start from -FLT_MAX so all-negative inputs are handled */
1277 for(oc = 0; oc < channels; oc++)
1278 max = fmaxf(max, src[oc]);
1279
1280 float sum = 0.f;
1281 for(oc = 0; oc < channels; oc++)
1282 {
1283 float value = expf(src[oc] - max);
1284 sum += value;
1285 dest[oc] = value;
1286 }
1287
1288 for(oc = 0; oc < channels; oc++)
1289 dest[oc] /= sum;
1290}
1291
1292static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
1293{
1294 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1295 uint32_t count = arg->input_count, i;
1296
1297 for(i = 0; i < count; i++)
1298 {
1299 kpu_model_memory_range_t input = arg->inputs_mem[i];
1300 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
1301 memcpy(dest, src, input.size);
1302 dest += input.size;
1303 }
1304}
1305
1306static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
1307{
1308 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1309 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1310 uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
1311 float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
1312 float *bias = (float *)malloc(out_channels * sizeof(float));
1313 memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
1314 memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
1315
1316 if(in_channels % 8 == 0)
1317 {
1318#define FC_UNROLL_1(x) \
1319 float i##x = *c_src++; \
1320 float w##x = *c_weights++;
1321
1322#define FC_UNROLL_2(x) \
1323 sum += i##x * w##x;
1324
1325#define FC_UNROLL_S(x) \
1326 FC_UNROLL_##x(0) \
1327 FC_UNROLL_##x(1) \
1328 FC_UNROLL_##x(2) \
1329 FC_UNROLL_##x(3) \
1330 FC_UNROLL_##x(4) \
1331 FC_UNROLL_##x(5) \
1332 FC_UNROLL_##x(6) \
1333 FC_UNROLL_##x(7)
1334
1335 for(oc = 0; oc < out_channels; oc++)
1336 {
1337 const float *c_src = src;
1338 const float *c_weights = weights + oc * in_channels;
1339
1340 float sum = 0.0f;
1341 for(ic = 0; ic < in_channels / 8; ic++)
1342 {
1343 FC_UNROLL_S(1);
1344 FC_UNROLL_S(2);
1345 }
1346
1347 dest[oc] = sum + bias[oc];
1348 }
1349 } else
1350 {
1351 for(oc = 0; oc < out_channels; oc++)
1352 {
1353 const float *c_weights = weights + oc * in_channels;
1354
1355 float sum = 0.0f;
1356 for(ic = 0; ic < in_channels; ic++)
1357 sum += src[ic] * c_weights[ic];
1358 dest[oc] = sum + bias[oc];
1359 }
1360 }
1361 free(weights);
1362 free(bias);
1363 kpu_float_activation(dest, out_channels, arg->act);
1364}
1365
1366static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
1367{
1368 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1369 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1370 kpu_model_shape_t in_shape = arg->shape;
1371 uint32_t oc, oy, ox;
1372
1373 for(oy = 0; oy < in_shape.height; oy++)
1374 for(ox = 0; ox < in_shape.width; ox++)
1375 for(oc = 0; oc < in_shape.channels; oc++)
1376 *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
1377}
1378
1379static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
1380{
1381 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1382 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1383 kpu_model_shape_t in_shape = arg->in_shape;
1384 uint32_t out_width = arg->out_width, out_height = arg->out_height;
1385 uint32_t oc, oy, ox;
1386
1387 float height_scale = (float)in_shape.height / out_height;
1388 float width_scale = (float)in_shape.width / out_width;
1389
1390 for(oc = 0; oc < in_shape.channels; oc++)
1391 {
1392 const float *channel_src = src + in_shape.width * in_shape.height * oc;
1393 for(oy = 0; oy < out_height; oy++)
1394 {
1395 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
1396 const float *y_origin = channel_src + in_y * in_shape.width;
1397 for(ox = 0; ox < out_width; ox++)
1398 {
1399 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
1400 *dest++ = y_origin[in_x];
1401 }
1402 }
1403 }
1404}
1405
1406static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
1407{
1408 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1409 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1410 kpu_model_shape_t in_shape = arg->in_shape;
1411 uint32_t out_width = arg->out_width, out_height = arg->out_height;
1412 uint32_t oc, oy, ox;
1413
1414 float height_scale = (float)in_shape.height / out_height;
1415 float width_scale = (float)in_shape.width / out_width;
1416
1417 for(oc = 0; oc < in_shape.channels; oc++)
1418 {
1419 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
1420 for(oy = 0; oy < out_height; oy++)
1421 {
1422 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
1423 const uint8_t *y_origin = channel_src + in_y * in_shape.width;
1424 for(ox = 0; ox < out_width; ox++)
1425 {
1426 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
1427 *dest++ = y_origin[in_x];
1428 }
1429 }
1430 }
1431}
1432
1433static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
1434{
1435 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1436 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1437 size_t oc, channels = arg->channels;
1438
1439 for(oc = 0; oc < channels; oc++)
1440 dest[oc] = 1.f / (1.f + expf(-src[oc]));
1441}
1442
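/*
 * Dispatch a K210 convolution layer to the KPU: the layer descriptor is
 * copied out of the model buffer, its weight/batch-norm/activation pointers
 * are rebased by subtracting IOMEM, and the result either streams to main
 * memory over DMA (KLF_MAIN_MEM_OUT) or stays in KPU RAM with the calc-done
 * interrupt driving the next step.
 */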
1443static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
1444{
1445 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
1446 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
1447 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
1448 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
1449
1450 if(arg->flags & KLF_MAIN_MEM_OUT)
1451 {
1452 dmac_channel_number_t dma_ch = ctx->dma_ch;
1453 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
1454 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1455 .calc_done_int = 1,
1456 .layer_cfg_almost_empty_int = 1,
1457 .layer_cfg_almost_full_int = 1};
1458 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1459 .calc_done_int = 1,
1460 .layer_cfg_almost_empty_int = 1,
1461 .layer_cfg_almost_full_int = 1};
1462 layer.dma_parameter.data.send_data_out = 1;
1463 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
1464 if(ctx->current_layer < ctx->layers_length)
1465 dmac_set_irq(dma_ch, ai_step, ctx, 1);
1466 else
1467 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
1468 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
1469 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
1470 } else
1471 {
1472 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1473 .calc_done_int = 1,
1474 .layer_cfg_almost_empty_int = 1,
1475 .layer_cfg_almost_full_int = 1};
1476
1477 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1478 .calc_done_int = 0,
1479 .layer_cfg_almost_empty_int = 1,
1480 .layer_cfg_almost_full_int = 1};
1481 layer.interrupt_enabe.data.int_en = 1;
1482 }
1483
1484 kpu_send_layer((const kpu_layer_argument_t *)&layer);
1485}
1486
1487static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1488{
1489 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1490#if USE_CACHED_AI_RAM
1491 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
1492#else
1493 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
1494#endif
1495
1496 uint32_t row_padding = 16;
1497 uint32_t row_group = 4;
1498 uint32_t row_length = 1;
1499 uint32_t height = 4;
1500 uint32_t oc, x, y, channels = arg->channels;
1501
1502 for(oc = 0; oc < channels; oc++)
1503 {
1504 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
1505 for(y = 0; y < 1; y++)
1506 {
1507 uint8_t *y_origin = channel_origin + y * row_length * 64;
1508 for(x = 0; x < 1; x++)
1509 y_origin[x] = *src++;
1510 }
1511 }
1512
1513#if USE_CACHED_AI_RAM
1514 uint32_t lines = row_length * height * channels / row_group;
1515 kpu_flush_cache(arg->kpu_mem_out_address, lines);
1516#endif
1517}
1518
1519static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1520{
1521 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1522 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1523 uint32_t oc, channels = arg->channels;
1524
1525 for(oc = 0; oc < channels; oc++)
1526 *dest++ = src[oc * 16];
1527}
1528
1529static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
1530{
1531 size_t width = arg->width;
1532 size_t height = arg->height;
1533 size_t channels = arg->channels;
1534
1535 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
1536}
1537
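/*
 * Parse a kmodel v3 (arch 0) buffer in place: the header is followed by the
 * output descriptors, then the per-layer headers, then the layer bodies.
 * A working buffer of header->main_mem_usage bytes is allocated, and the
 * layer bodies are copied to their non-cached (IOMEM) alias and verified
 * byte by byte.
 */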
1538int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
1539{
1540#if FIX_CACHE
1541 configASSERT(is_memory_cache((uintptr_t)buffer));
1542#endif
1543 uintptr_t base_addr = (uintptr_t)buffer;
1544 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
1545
1546 if (header->version == 3 && header->arch == 0)
1547 {
1548 ctx->is_nncase = 0;
1549 ctx->model_buffer = buffer;
1550 ctx->output_count = header->output_count;
1551 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
1552 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
1553 ctx->layers_length = header->layers_length;
1554 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
1555 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
1556 if (!ctx->main_buffer)
1557 return -1;
1558 uint32_t body_size = 0;
1559 for (int i=0; i<ctx->layers_length; i++)
1560 {
1561 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
1562 body_size += cnt_layer_header->body_size;
1563 }
1564 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
1565 const uint8_t *body_start_cache = ctx->body_start;
1566 memcpy(body_start_iomem, body_start_cache, body_size);
1567 for (int i=0; i<body_size; i++)
1568 {
1569 configASSERT(body_start_iomem[i] == body_start_cache[i]);
1570 }
1571
1572 } else
1573 {
1574 return -1;
1575 }
1576
1577 return 0;
1578}
1579
1580int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
1581{
1582 if(ctx->is_nncase)
1583 return -1;
1584
1585 if(index >= ctx->output_count)
1586 return -1;
1587
1588 const kpu_model_output_t *output = ctx->outputs + index;
1589 *data = ctx->main_buffer + output->address;
1590 *size = output->size;
1591 return 0;
1592}
1593
1594void kpu_model_free(kpu_model_context_t *ctx)
1595{
1596 if(ctx->is_nncase)
1597 return;
1598
1599 free(ctx->main_buffer);
1600 ctx->main_buffer = NULL;
1601}
1602
1603#if KPU_DEBUG
1604static uint64_t last_time;
1605static uint64_t total_time;
1606static uint64_t kpu_time;
1607static uint32_t last_layer_type;
1608
1609static const char *str_layer_type(uint32_t type)
1610{
1611 switch(type)
1612 {
1613 case KL_ADD:
1614 return "Add";
1615 case KL_QUANTIZED_ADD:
1616 return "QuantAdd";
1617 case KL_GLOBAL_AVERAGE_POOL2D:
1618 return "GAP";
1619 case KL_QUANTIZED_MAX_POOL2D:
1620 return "QuantMaxPool2d";
1621 case KL_AVERAGE_POOL2D:
1622 return "AveragePool2d";
1623 case KL_QUANTIZE:
1624 return "Quantize";
1625 case KL_DEQUANTIZE:
1626 return "Dequantize";
1627 case KL_REQUANTIZE:
1628 return "Requantize";
1629 case KL_L2_NORMALIZATION:
1630 return "L2Norm";
1631 case KL_SOFTMAX:
1632 return "Softmax";
1633 case KL_CONCAT:
1634 return "Concat";
1635 case KL_QUANTIZED_CONCAT:
1636 return "QuantConcat";
1637 case KL_FULLY_CONNECTED:
1638 return "FullyConnected";
1639 case KL_TENSORFLOW_FLATTEN:
1640 return "TFFlatten";
1641 case KL_RESIZE_NEAREST_NEIGHBOR:
1642 return "ResizeNearestNeighbor";
1643 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1644 return "QuantResizeNearestNeighbor";
1645 case KL_CHANNELWISE_DEQUANTIZE:
1646 return "ChannelwiseDequantize";
1647 case KL_LOGISTIC:
1648 return "Logistic";
1649 case KL_K210_CONV:
1650 return "K210Conv";
1651 case KL_K210_ADD_PADDING:
1652 return "K210AddPad";
1653 case KL_K210_REMOVE_PADDING:
1654 return "K210RemovePad";
1655 case KL_K210_UPLOAD:
1656 return "K210Upload";
1657 default:
1658 return "Unknown";
1659 }
1660}
1661#endif
1662
1663static int kpu_kmodel_done(kpu_model_context_t *ctx)
1664{
1665 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1666 .calc_done_int = 1,
1667 .layer_cfg_almost_empty_int = 1,
1668 .layer_cfg_almost_full_int = 1};
1669 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1670 .calc_done_int = 1,
1671 .layer_cfg_almost_empty_int = 1,
1672 .layer_cfg_almost_full_int = 1};
1673#if KPU_DEBUG
1674 uint32_t cnt_layer_id = ctx->current_layer - 1;
1675 uint64_t time = sysctl_get_time_us();
1676 if(last_time != 0)
1677 {
1678 uint64_t layer_time = time - last_time;
1679 syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
1680 total_time += layer_time;
1681 if(last_layer_type == KL_K210_CONV)
1682 kpu_time += layer_time;
1683 }
1684
1685 syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
1686 syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
1687 syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
1688#endif
1689 ctx->done_callback(ctx->userdata);
1690 return 0;
1691}
1692
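/*
 * Execute one model layer per call: software layers (add, pooling, quantize,
 * softmax, ...) run on the CPU and immediately proceed to the next layer,
 * while a KL_K210_CONV layer returns here because the KPU interrupt or the
 * output DMA completion re-enters ai_step. kpu_kmodel_done fires after the
 * last layer.
 */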
1693static int ai_step(void *userdata)
1694{
1695 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
1696
1697 uint32_t cnt_layer_id = ctx->current_layer;
1698 const uint8_t *layer_body = ctx->current_body;
1699 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
1700 if (cnt_layer_id >= ctx->layers_length) {
1701 //syslog(LOG_NOTICE, "overrun");
1702 kpu_kmodel_done(ctx);
1703 return -1;
1704 }
1705
1706 ctx->current_layer++;
1707 ctx->current_body += cnt_layer_header->body_size;
1708
1709#if KPU_DEBUG
1710 uint64_t time = sysctl_get_time_us();
1711 if(last_time != 0)
1712 {
1713 uint64_t layer_time = time - last_time;
1714 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1715 total_time += layer_time;
1716 if(last_layer_type == KL_K210_CONV)
1717 kpu_time += layer_time;
1718 }
1719
1720 last_layer_type = cnt_layer_header->type;
1721 last_time = sysctl_get_time_us();
1722#endif
1723
1724 switch(cnt_layer_header->type)
1725 {
1726 case KL_ADD:
1727 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
1728 break;
1729 case KL_QUANTIZED_ADD:
1730 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
1731 break;
1732 case KL_GLOBAL_AVERAGE_POOL2D:
1733 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
1734 break;
1735 case KL_QUANTIZED_MAX_POOL2D:
1736 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
1737 break;
1738 case KL_AVERAGE_POOL2D:
1739 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
1740 break;
1741 case KL_QUANTIZE:
1742 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
1743 break;
1744 case KL_DEQUANTIZE:
1745 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
1746 break;
1747 case KL_REQUANTIZE:
1748 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
1749 break;
1750 case KL_L2_NORMALIZATION:
1751 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
1752 break;
1753 case KL_SOFTMAX:
1754 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
1755 break;
1756 case KL_CONCAT:
1757 case KL_QUANTIZED_CONCAT:
1758 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
1759 break;
1760 case KL_FULLY_CONNECTED:
1761 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
1762 break;
1763 case KL_TENSORFLOW_FLATTEN:
1764 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
1765 break;
1766 case KL_RESIZE_NEAREST_NEIGHBOR:
1767 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1768 break;
1769 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1770 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1771 break;
1772 case KL_CHANNELWISE_DEQUANTIZE:
1773 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
1774 break;
1775 case KL_LOGISTIC:
1776 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
1777 break;
1778 case KL_K210_CONV:
1779 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
1780 return 0;
1781 case KL_K210_ADD_PADDING:
1782 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
1783 break;
1784 case KL_K210_REMOVE_PADDING:
1785 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
1786 break;
1787 case KL_K210_UPLOAD:
1788 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
1789 break;
1790 default:
1791 assert(!"Layer is not supported.");
1792 kpu_kmodel_done(ctx);
1793 return -1;
1794 }
1795
1796 if (ctx->current_layer < ctx->layers_length)
1797 ai_step(userdata);
1798 else
1799 kpu_kmodel_done(ctx);
1800 return 0;
1801}
1802
1803static void ai_step_not_isr(void *userdata)
1804{
1805 sysctl_disable_irq();
1806 ai_step(userdata);
1807 sysctl_enable_irq();
1808}
1809
1810int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
1811{
1812 if(ctx->is_nncase)
1813 return -1;
1814
1815 ctx->dma_ch = dma_ch;
1816 ctx->done_callback = done_callback;
1817 ctx->userdata = userdata;
1818 ctx->current_layer = 0;
1819 ctx->current_body = ctx->body_start;
1820#if KPU_DEBUG
1821 last_time = 0;
1822 total_time = 0;
1823 kpu_time = 0;
1824#endif
1825
1826 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
1827 kpu->interrupt_clear.reg = 7;
1828 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
1829 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
1830 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
1831 .eight_bit_mode = header->flags & 1};
1832 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1833 .calc_done_int = 1,
1834 .layer_cfg_almost_empty_int = 0,
1835 .layer_cfg_almost_full_int = 1};
1836
1837 plic_set_priority(INTNO_AI, 1);
1838 plic_irq_register(INTNO_AI, ai_step, ctx);
1839 plic_irq_enable(INTNO_AI);
1840
1841 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
1842
1843 switch(first_layer_header->type)
1844 {
1845 case KL_K210_CONV:
1846 {
1847 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
1848 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
1849
1850 if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
1851 {
1852 kpu_kmodel_input_with_padding(&layer_arg, src);
1853 ai_step_not_isr(ctx);
1854 } else
1855 {
1856 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
1857 }
1858 }
1859 break;
1860 case KL_FULLY_CONNECTED:
1861 {
1862 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
1863 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
1864 ai_step_not_isr(ctx);
1865 }
1866 break;
1867 default:
1868 return -1;
1869 }
1870
1871 return 0;
1872}
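/*
 * Minimal usage sketch: load a kmodel, run one inference and read output 0.
 * The helper names (kpu_example_run, kpu_example_done, g_ai_done), the
 * busy-wait and the choice of AI_DMA_CH are illustrative assumptions, and
 * the callback signature assumes kpu_done_callback_t takes only a userdata
 * pointer, as it is invoked above. Guarded with #if 0 so it does not affect
 * the build.
 */
#if 0
static volatile int g_ai_done;

static void kpu_example_done(void *userdata)
{
    (void)userdata;
    g_ai_done = 1;
}

static void kpu_example_run(const uint8_t *model_data, const uint8_t *input_image)
{
    static kpu_model_context_t task;

    if (kpu_load_kmodel(&task, model_data) != 0)
        return; /* unsupported kmodel version or out of memory */

    g_ai_done = 0;
    /* AI_DMA_CH is the channel this port's DMA helpers expect. */
    if (kpu_run_kmodel(&task, input_image, AI_DMA_CH, kpu_example_done, NULL) != 0)
        return;

    while (!g_ai_done)
        ; /* a real application would block on a task event instead */

    uint8_t *output;
    size_t output_size;
    if (kpu_get_output(&task, 0, &output, &output_size) == 0)
    {
        /* consume output[0..output_size-1] here */
    }

    kpu_model_free(&task);
}
#endif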