source: azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c@ 458

Last change on this file since 458 was 458, checked in by coas-nagasima, 4 years ago

Improved the behavior of SPI, Serial, and the KPU

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 42.6 KB
RevLine 
[453]1#include <assert.h>
2#include <float.h>
3#include <math.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7#include <stdint.h>
8#include <kernel.h>
9#include <t_syslog.h>
10#include <t_stdlib.h>
11#include <kernel_impl.h>
12#include <target_syssvc.h>
13#include "kendryte-k210.h"
14#include "device.h"
15#include "atomic.h"
16#include "kpu.h"
17#include "utils.h"
18#include "kpu_main.h"
[458]19#include "kernel_cfg.h"
[453]20
21#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
22
23uint64_t sysctl_get_time_us(void)
24{
[458]25 uint64_t v_cycle = read_cycle();
26 return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
[453]27}
28
29static int is_memory(uintptr_t address)
30{
[458]31 enum
32 {
33 mem_len = 6 * 1024 * 1024,
34 mem_no_cache_len = 8 * 1024 * 1024,
35 };
36 return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
[453]37}
38
39uint32_t is_memory_cache(uintptr_t address)
40{
[458]41#define MEM_CACHE_LEN (6 * 1024 * 1024)
[453]42
[458]43 return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
[453]44}
45
46int plic_irq_enable(INTNO irq_number)
47{
[458]48 if (irq_number != INTNO_AI)
49 return -1;
50 ena_int(irq_number);
51 return 0;
[453]52}
53
54int plic_set_priority(INTNO irq_number, uint32_t priority)
55{
[458]56 if (irq_number != INTNO_AI)
57 return -1;
58 set_ipriority(irq_number, priority);
59 return 0;
[453]60}
61
62plic_irq_callback_t ai_done_callback;
63void *ai_done_ctx;
64
65void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
66{
[458]67 if (irq != INTNO_AI)
68 return;
[453]69
[458]70 dis_int(INTNO_AI);
[453]71
[458]72 ai_done_callback = callback;
73 ai_done_ctx = ctx;
[453]74
[458]75 ena_int(INTNO_AI);
[453]76}
77
78void ai_done_isr(intptr_t exinf)
79{
[458]80 dis_int(INTNO_AI);
81 if (ai_done_callback != NULL)
82 {
83 ai_done_callback(ai_done_ctx);
84 }
85 ena_int(INTNO_AI);
[453]86}
87
88plic_irq_callback_t ai_dma_done_callback;
89void *ai_dma_done_ctx;
90
91void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
[458]92 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
[453]93{
[458]94 if (channel_num != AI_DMA_CH)
95 return;
[453]96
[458]97 //set_ipriority(INTNO_DMAAI, priority);
[453]98
[458]99 dis_int(INTNO_DMAAI);
[453]100
[458]101 ai_dma_done_callback = dmac_callback;
102 ai_dma_done_ctx = ctx;
[453]103
[458]104 ena_int(INTNO_DMAAI);
[453]105}
106
107void ai_dma_done_isr(DMA_Handle_t *dma)
108{
[458]109 dis_int(INTNO_DMAAI);
110
111 if (ai_dma_done_callback != NULL)
112 {
113 ai_dma_done_callback(ai_dma_done_ctx);
114 }
115
116 ena_int(INTNO_DMAAI);
[453]117}
118
119void dmac_set_irq(dmac_channel_number_t channel_num,
[458]120 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
[453]121{
[458]122 if (channel_num != AI_DMA_CH)
123 return;
[453]124
[458]125 //set_ipriority(INTNO_DMAAI, priority);
[453]126
[458]127 dis_int(INTNO_DMAAI);
[453]128
[458]129 ai_dma_done_callback = dmac_callback;
130 ai_dma_done_ctx = ctx;
[453]131
[458]132 ena_int(INTNO_DMAAI);
[453]133}
134
135DMA_Handle_t g_ai_hdma;
136
137void dmac_set_single_mode(dmac_channel_number_t channel_num,
[458]138 const void *src, void *dest, uint8_t src_inc,
139 uint8_t dest_inc,
140 uint8_t dmac_burst_size,
141 uint8_t dmac_trans_width,
142 size_t block_size)
[453]143{
[458]144 if (channel_num != AI_DMA_CH)
145 return;
[453]146
[458]147 DMA_Handle_t *hdma = &g_ai_hdma;
148 int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
149 uint8_t flow_control;
150 if (mem_type_src == 0 && mem_type_dest == 0)
151 flow_control = DMA_PERIPH_TO_PERIPH;
152 else if (mem_type_src == 1 && mem_type_dest == 0)
153 flow_control = DMA_MEMORY_TO_PERIPH;
154 else if (mem_type_src == 0 && mem_type_dest == 1)
155 flow_control = DMA_PERIPH_TO_MEMORY;
156 else
157 flow_control = DMA_MEMORY_TO_MEMORY;
[453]158
[458]159 hdma->Init.Direction = flow_control;                                        /* DMA transfer direction */
160 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
161 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
162 hdma->Init.SrcInc = src_inc;                                                     /* source increment setting */
163 hdma->Init.DstInc = dest_inc;                                                    /* destination increment setting */
164 hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
165 hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
166 hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
167 hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
168 dma_reset(hdma);
[453]169
[458]170 dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
[453]171}
172
173#define LAYER_BURST_SIZE 12
174
175#define KPU_DEBUG 0
176#define USE_CACHED_AI_RAM 0
177
178#define min(a, b) (((a) < (b)) ? (a) : (b))
179#define max(a, b) (((a) > (b)) ? (a) : (b))
180#define ALIGN_UP(x, align) (((x) + ((align)-1)) & (~((align)-1)))
181
182static int ai_step(void *userdata);
183static int kpu_kmodel_done(kpu_model_context_t *ctx);
184
185volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
186static volatile uint32_t kpu_status;
187
188static void kpu_send_layer(const kpu_layer_argument_t *layer)
189{
[458]190 kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
191 kpu->layer_argument_fifo = layer->image_addr.reg;
192 kpu->layer_argument_fifo = layer->image_channel_num.reg;
193 kpu->layer_argument_fifo = layer->image_size.reg;
194 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
195 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
196 kpu->layer_argument_fifo = layer->kernel_offset.reg;
197 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
198 kpu->layer_argument_fifo = layer->write_back_cfg.reg;
199 kpu->layer_argument_fifo = layer->conv_value.reg;
200 kpu->layer_argument_fifo = layer->conv_value2.reg;
201 kpu->layer_argument_fifo = layer->dma_parameter.reg;
[453]202}
203
204void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
205{
[458]206 uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
207 dmac_set_irq(dma_ch, callback, userdata, 1);
208 dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
209 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
[453]210}
211
212static void kpu_conv2d_core(kpu_layer_argument_t *layer)
213{
[458]214 kpu_send_layer(layer);
[453]215}
216
217void kpu_conv2d(kpu_layer_argument_t *layer)
218{
[458]219 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
220 .calc_done_int = 1,
221 .layer_cfg_almost_empty_int = 1,
222 .layer_cfg_almost_full_int = 1};
223 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
224 .calc_done_int = 1,
225 .layer_cfg_almost_empty_int = 0,
226 .layer_cfg_almost_full_int = 1};
227 kpu_conv2d_core(layer);
[453]228}
229
230void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
231{
[458]232 quantize_param_t q1 = *src_param, q2 = *dest_param;
233 size_t oc, y, x;
[453]234
[458]235 if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
236 {
237 uint32_t row_padding = 16;
238 uint32_t row_group = 4;
239 uint32_t row_length = 1;
240 uint32_t height = 4;
[453]241
[458]242 for (oc = 0; oc < channels; oc++)
243 {
244 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
245 for (y = 0; y < 1; y++)
246 {
247 uint8_t *y_origin = channel_origin + y * row_length * 64;
248 for (x = 0; x < 1; x++)
249 {
250 int64_t sum = 0;
251 size_t i;
252 for (i = 0; i < kernel_size; i++)
253 sum += *src++;
[453]254
[458]255 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
256 if (value < 0)
257 value = 0;
258 if (value > 0xFF)
259 value = 0xFF;
260 y_origin[x] = value;
261 }
262 }
263 }
264 }
265 else
266 {
267 for (oc = 0; oc < channels; oc++)
268 {
269 int64_t sum = 0;
270 size_t i;
271 for (i = 0; i < kernel_size; i++)
272 sum += *src++;
[453]273
[458]274 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
275 if (value < 0)
276 value = 0;
277 if (value > 0xFF)
278 value = 0xFF;
279 dest[oc] = value;
280 }
281 }
[453]282}
283
284void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
285{
[458]286 quantize_param_t q = *src_param;
287 size_t oc;
[453]288
[458]289 for (oc = 0; oc < channels; oc++)
290 {
291 int64_t sum = 0;
292 size_t i;
293 for (i = 0; i < kernel_size; i++)
294 sum += *src++;
[453]295
[458]296 float value = (sum * q.scale + q.bias) / kernel_size;
297 dest[oc] = value;
298 }
[453]299}
300
301#if USE_CACHED_AI_RAM
302static void kpu_flush_cache(uint32_t addr, size_t lines)
303{
[458]304 size_t line;
305 for (line = 0; line < lines; line++)
306 {
307 const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
308 uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
309 size_t i;
310 for (i = 0; i < 8; i++)
311 dest[i] = src[i];
312 }
[453]313}
314#endif
315static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
316{
[458]317 if (shift > 0)
318 {
319 value >>= shift - 1;
320 if (value & 0x1)
321 {
322 if (value < 0)
323 value = (value >> 1) - 1;
324 else
325 value = (value >> 1) + 1;
326 }
327 else
328 {
329 value >>= 1;
330 }
331 }
[453]332
[458]333 return value;
[453]334}
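/*
 * Worked example (illustrative, derived from the function above): for positive
 * inputs kpu_carry_shift() is a right shift by `shift` bits that rounds
 * half away from zero:
 *
 *   kpu_carry_shift(5, 1) -> 3   (5 >> 0 = 5, LSB set, positive: (5 >> 1) + 1)
 *   kpu_carry_shift(4, 1) -> 2   (4 >> 0 = 4, LSB clear: plain 4 >> 1)
 *   kpu_carry_shift(x, 0) -> x   (a shift of 0 returns the value unchanged)
 */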
335static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
336{
[458]337 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
338 size_t oc, y, x;
339 uint32_t row_padding;
340 uint32_t row_group;
341 uint32_t row_length;
342 if (width <= 16)
343 {
344 row_padding = 16;
345 row_group = 4;
346 row_length = 1;
347 }
348 else if (width <= 32)
349 {
350 row_padding = 32;
351 row_group = 2;
352 row_length = 1;
353 }
354 else
355 {
356 row_padding = 64;
357 row_group = 1;
358 row_length = (width + 63) / 64;
359 }
[453]360
[458]361 if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
362 {
[453]363#define UPLOAD_BEGIN() \
[458]364 for (oc = 0; oc < channels; oc++) \
365 { \
366 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
367 for (y = 0; y < height; y++) \
368 { \
369 uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
[453]370
371#define UPLOAD_END() \
[458]372 } \
373 }
[453]374
[458]375 width /= 8;
376 const uint64_t *u64_src = (const uint64_t *)src;
377 if (width == 1)
378 {
379 UPLOAD_BEGIN()
380 y_origin[0] = *u64_src++;
381 UPLOAD_END()
382 }
383 else if (width == 2)
384 {
385 UPLOAD_BEGIN()
386 {
387 y_origin[0] = *u64_src++;
388 y_origin[1] = *u64_src++;
389 }
390 UPLOAD_END()
391 }
392 else if (width == 4)
393 {
394 UPLOAD_BEGIN()
395 {
396 y_origin[0] = *u64_src++;
397 y_origin[1] = *u64_src++;
398 y_origin[2] = *u64_src++;
399 y_origin[3] = *u64_src++;
400 }
401 UPLOAD_END()
402 }
403 else
404 {
405 UPLOAD_BEGIN()
406 for (x = 0; x < width; x++)
407 y_origin[x] = *u64_src++;
408 UPLOAD_END()
409 }
410 }
411 else
412 {
413 for (oc = 0; oc < channels; oc++)
414 {
415 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
416 for (y = 0; y < height; y++)
417 {
418 uint8_t *y_origin = channel_origin + y * row_length * 64;
419 for (x = 0; x < width; x++)
420 y_origin[x] = *src++;
421 }
422 }
423 }
[453]424}
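/*
 * Illustrative note on the layout written by kpu_upload_core() (derived from
 * the code above, not from separate documentation): for an image with
 * width <= 16, four channels share each 64-byte KPU RAM row (row_group = 4,
 * row_padding = 16), so channel oc, row y starts at byte offset
 *
 *   (oc / 4) * height * 64 + y * 64 + (oc % 4) * 16
 *
 * e.g. with height = 16, channel 5 / row 2 begins at 1 * 1024 + 128 + 16 = 1168.
 */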
425static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
426{
[458]427 size_t width = layer->image_size.data.i_row_wid + 1;
428 size_t height = layer->image_size.data.i_col_high + 1;
429 size_t channels = layer->image_channel_num.data.i_ch_num + 1;
[453]430
[458]431 kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
[453]432}
433
434static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
435{
[458]436 memcpy(dest, src, count * sizeof(float));
[453]437}
438
439static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
440{
[458]441 size_t i;
[453]442
[458]443 if (act == KLA_RELU)
444 {
445 for (i = 0; i < count; i++)
446 data[i] = max(data[i], 0);
447 }
448 else if (act == KLA_RELU6)
449 {
450 for (i = 0; i < count; i++)
451 data[i] = min(max(data[i], 0), 6);
452 }
[453]453}
454
455static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
456{
[458]457 const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
458 const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
459 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
460 size_t i, count = arg->count;
[453]461
[458]462 for (i = 0; i < count; i++)
463 dest[i] = src_a[i] + src_b[i];
[453]464}
465
466static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
467{
[458]468 const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
469 const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
470 size_t count = ALIGN_UP(arg->count, 8) / 8;
471 int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
472 int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
473 int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
[453]474
[458]475 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
476 size_t i;
[453]477
[458]478 if (sh_a == sh_b)
479 {
[453]480#define QADD_UNROLL_1(x) \
[458]481 int64_t a##x = *src_a++; \
482 int64_t b##x = *src_b++;
[453]483
484#define QADD_UNROLL_2(x) \
[458]485 a##x += off_a; \
486 b##x += off_b;
[453]487
488#define QADD_UNROLL_3(x) \
[458]489 a##x *= mul_a; \
490 b##x *= mul_b;
[453]491
492#define QADD_UNROLL_4(x) \
[458]493 int64_t v##x = a##x + b##x;
[453]494
495#define QADD_UNROLL_5(x) \
[458]496 v##x >>= sh_a;
[453]497
498#define QADD_UNROLL_6(x) \
[458]499 v##x *= mul_o;
[453]500
501#define QADD_UNROLL_7(x) \
[458]502 v##x = kpu_carry_shift(v##x, sh_o);
[453]503
504#define QADD_UNROLL_8(x) \
[458]505 v##x += off_o;
[453]506
507#define QADD_UNROLL_9(x) \
[458]508 v##x = min(0xFF, max(0, v##x));
[453]509
510#define QADD_UNROLL_10(x) \
[458]511 *dest++ = v##x;
[453]512
513#define QADD_UNROLL_S(x) \
[458]514 QADD_UNROLL_##x(0) \
515 QADD_UNROLL_##x(1) \
516 QADD_UNROLL_##x(2) \
517 QADD_UNROLL_##x(3) \
518 QADD_UNROLL_##x(4) \
519 QADD_UNROLL_##x(5) \
520 QADD_UNROLL_##x(6) \
521 QADD_UNROLL_##x(7)
[453]522
[458]523 for (i = 0; i < count; i++)
524 {
525 QADD_UNROLL_S(1);
526 QADD_UNROLL_S(2);
527 QADD_UNROLL_S(3);
528 QADD_UNROLL_S(4);
529 QADD_UNROLL_S(5);
530 QADD_UNROLL_S(6);
531 QADD_UNROLL_S(7);
532 QADD_UNROLL_S(8);
533 QADD_UNROLL_S(9);
534 QADD_UNROLL_S(10);
535 }
536 }
537 else
538 {
[453]539#undef QADD_UNROLL_1
540#define QADD_UNROLL_1(x) \
[458]541 int64_t a##x = *src_a++; \
542 int64_t b##x = *src_b++;
[453]543
544#undef QADD_UNROLL_2
545#define QADD_UNROLL_2(x) \
[458]546 a##x += off_a; \
547 b##x += off_b;
[453]548
549#undef QADD_UNROLL_3
550#define QADD_UNROLL_3(x) \
[458]551 a##x *= mul_a; \
552 b##x *= mul_b;
[453]553
554#undef QADD_UNROLL_4
555#define QADD_UNROLL_4(x) \
[458]556 a##x >>= sh_a; \
557 b##x >>= sh_b;
[453]558
559#undef QADD_UNROLL_5
560#define QADD_UNROLL_5(x) \
[458]561 int64_t v##x = a##x + b##x;
[453]562
563#undef QADD_UNROLL_6
564#define QADD_UNROLL_6(x) \
[458]565 v##x *= mul_o;
[453]566
567#undef QADD_UNROLL_7
568#define QADD_UNROLL_7(x) \
[458]569 v##x = kpu_carry_shift(v##x, sh_o);
[453]570
571#undef QADD_UNROLL_8
572#define QADD_UNROLL_8(x) \
[458]573 v##x += off_o;
[453]574
575#undef QADD_UNROLL_9
576#define QADD_UNROLL_9(x) \
[458]577 v##x = min(0xFF, max(0, v##x));
[453]578
579#undef QADD_UNROLL_10
580#define QADD_UNROLL_10(x) \
[458]581 *dest++ = v##x;
[453]582
583#undef QADD_UNROLL_S
584#define QADD_UNROLL_S(x) \
[458]585 QADD_UNROLL_##x(0) \
586 QADD_UNROLL_##x(1) \
587 QADD_UNROLL_##x(2) \
588 QADD_UNROLL_##x(3) \
589 QADD_UNROLL_##x(4) \
590 QADD_UNROLL_##x(5) \
591 QADD_UNROLL_##x(6) \
592 QADD_UNROLL_##x(7)
[453]593
[458]594 for (i = 0; i < count; i++)
595 {
596 QADD_UNROLL_S(1);
597 QADD_UNROLL_S(2);
598 QADD_UNROLL_S(3);
599 QADD_UNROLL_S(4);
600 QADD_UNROLL_S(5);
601 QADD_UNROLL_S(6);
602 QADD_UNROLL_S(7);
603 QADD_UNROLL_S(8);
604 QADD_UNROLL_S(9);
605 QADD_UNROLL_S(10);
606 }
607 }
[453]608}
609
610static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
611{
[458]612 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
613 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
614 size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
[453]615
[458]616 for (oc = 0; oc < channels; oc++)
617 {
618 float sum = 0.f;
619 size_t i;
620 for (i = 0; i < kernel_size; i++)
621 sum += *src++;
[453]622
[458]623 dest[oc] = sum / kernel_size;
624 }
[453]625}
626
627static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
628{
[458]629 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
630 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
631 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
632 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
633 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
634 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
[453]635
[458]636 uint32_t out_y, out_x, oc;
[453]637
[458]638 for (oc = 0; oc < out_shape.channels; oc++)
639 {
640 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
641 for (out_y = 0; out_y < out_shape.height; out_y++)
642 {
643 for (out_x = 0; out_x < out_shape.width; out_x++)
644 {
645 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
646 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
647 int32_t kernel_x_start = max(0, -in_x_origin);
648 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
649 int32_t kernel_y_start = max(0, -in_y_origin);
650 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
651 uint8_t value = 0;
[453]652
[458]653 int32_t kernel_y, kernel_x;
654 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
655 {
656 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
657 {
658 int32_t in_x = in_x_origin + kernel_x;
659 int32_t in_y = in_y_origin + kernel_y;
660 value = max(value, channel_src[in_y * in_shape.width + in_x]);
661 }
662 }
[453]663
[458]664 *dest++ = value;
665 }
666 }
667 }
[453]668}
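/*
 * Illustrative note (derived from the loop above; kpu_average_pool2d() below
 * uses the same indexing): the kernel_x/y_start and _end bounds clip the
 * pooling window to the input image. With padding_width = 1 and out_x = 0,
 * in_x_origin = -1, so kernel_x_start = max(0, 1) = 1 and the out-of-image
 * column at in_x = -1 is simply skipped.
 */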
669
670static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
671{
[458]672 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
673 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
674 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
675 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
676 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
677 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
[453]678
[458]679 uint32_t out_y, out_x, oc;
[453]680
[458]681 for (oc = 0; oc < out_shape.channels; oc++)
682 {
683 const float *channel_src = src + in_shape.width * in_shape.height * oc;
684 for (out_y = 0; out_y < out_shape.height; out_y++)
685 {
686 for (out_x = 0; out_x < out_shape.width; out_x++)
687 {
688 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
689 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
690 int32_t kernel_x_start = max(0, -in_x_origin);
691 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
692 int32_t kernel_y_start = max(0, -in_y_origin);
693 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
694 float value = 0;
695 float kernel_count = 0;
[453]696
[458]697 int32_t kernel_y, kernel_x;
698 for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
699 {
700 for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
701 {
702 int32_t in_x = in_x_origin + kernel_x;
703 int32_t in_y = in_y_origin + kernel_y;
704 value += channel_src[in_y * in_shape.width + in_x];
705 kernel_count++;
706 }
707 }
[453]708
[458]709 *dest++ = value / kernel_count;
710 }
711 }
712 }
[453]713}
714
715static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
716{
[458]717 size_t count = arg->count;
718 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
[453]719
[458]720 kpu_model_quant_param_t q = arg->quant_param;
[453]721
[458]722 float scale = 1.f / q.scale;
[453]723
[458]724 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
725 size_t i;
726 for (i = 0; i < count; i++)
727 {
728 int value = roundf((*src++ - q.bias) * scale);
729 if (value < 0)
730 value = 0;
731 if (value > 0xFF)
732 value = 0xFF;
733 *dest++ = (uint8_t)value;
734 }
[453]735}
736
737static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
738{
[458]739 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
740 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
741 size_t oc, count = arg->count;
742 kpu_model_quant_param_t q = arg->quant_param;
[453]743
[458]744 for (oc = 0; oc < count; oc++)
745 dest[oc] = *src++ * q.scale + q.bias;
[453]746}
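/*
 * Illustrative note (example values are assumed): kpu_quantize() and
 * kpu_kmodel_dequantize() apply the two directions of the same affine map,
 *
 *   real ~= q * scale + bias
 *   q     = clamp(round((real - bias) / scale), 0, 255)
 *
 * e.g. with scale = 0.5 and bias = -64.0, a real value of 10.0 quantizes to
 * round((10.0 + 64.0) / 0.5) = 148, and 148 dequantizes back to
 * 148 * 0.5 - 64.0 = 10.0.
 */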
747
748static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
749{
[458]750 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
751 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
752 size_t oc, i, channels = arg->channels, count = arg->channel_size;
[453]753
[458]754 for (oc = 0; oc < channels; oc++)
755 {
756 const kpu_model_quant_param_t q = arg->quant_params[oc];
[453]757
[458]758 for (i = 0; i < count; i++)
759 *dest++ = *src++ * q.scale + q.bias;
760 }
[453]761}
762
763static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
764{
[458]765 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
766 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
767 size_t oc, count = arg->count;
768 const uint8_t *table = arg->table;
[453]769
[458]770 if (false && count % 8 == 0)
771 {
772 for (oc = 0; oc < count;)
773 {
774 dest[oc++] = table[*src++];
775 dest[oc++] = table[*src++];
776 dest[oc++] = table[*src++];
777 dest[oc++] = table[*src++];
778 dest[oc++] = table[*src++];
779 dest[oc++] = table[*src++];
780 dest[oc++] = table[*src++];
781 dest[oc++] = table[*src++];
782 }
783 }
784 else
785 {
786 for (oc = 0; oc < count; oc++)
787 dest[oc] = table[src[oc]];
788 }
[453]789}
790
791static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
792{
[458]793 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
794 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
795 size_t oc, channels = arg->channels;
[453]796
[458]797 float sum = 0.f;
798 const float epsilon = 1e-10f;
799 for (oc = 0; oc < channels; oc++)
800 sum += src[oc] * src[oc];
801 if (sum < epsilon)
802 sum = epsilon;
803 sum = 1.f / sqrtf(sum);
804 for (oc = 0; oc < channels; oc++)
805 dest[oc] = src[oc] * sum;
[453]806}
807
808static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
809{
[458]810 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
811 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
812 size_t oc, channels = arg->channels;
[453]813
[458]814 float max = -FLT_MAX; /* most negative float, so the first element always updates the maximum */
815 for (oc = 0; oc < channels; oc++)
816 max = fmaxf(max, src[oc]);
[453]817
[458]818 float sum = 0.f;
819 for (oc = 0; oc < channels; oc++)
820 {
821 float value = expf(src[oc] - max);
822 sum += value;
823 dest[oc] = value;
824 }
[453]825
[458]826 for (oc = 0; oc < channels; oc++)
827 dest[oc] /= sum;
[453]828}
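/*
 * Illustrative note: subtracting the running maximum above is the standard
 * numerically stable softmax,
 *
 *   dest[i] = expf(src[i] - max) / sum_j expf(src[j] - max)
 *
 * which is mathematically identical to the plain softmax but keeps expf()
 * from overflowing for large inputs.
 */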
829
830static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
831{
[458]832 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
833 uint32_t count = arg->input_count, i;
[453]834
[458]835 for (i = 0; i < count; i++)
836 {
837 kpu_model_memory_range_t input = arg->inputs_mem[i];
838 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
839 memcpy(dest, src, input.size);
840 dest += input.size;
841 }
[453]842}
843
844static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
845{
[458]846 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
847 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
848 uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
849 float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
850 float *bias = (float *)malloc(out_channels * sizeof(float));
851 memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
852 memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
[453]853
[458]854 if (in_channels % 8 == 0)
855 {
[453]856#define FC_UNROLL_1(x) \
[458]857 float i##x = *c_src++; \
858 float w##x = *c_weights++;
[453]859
860#define FC_UNROLL_2(x) \
[458]861 sum += i##x * w##x;
[453]862
863#define FC_UNROLL_S(x) \
[458]864 FC_UNROLL_##x(0) \
865 FC_UNROLL_##x(1) \
866 FC_UNROLL_##x(2) \
867 FC_UNROLL_##x(3) \
868 FC_UNROLL_##x(4) \
869 FC_UNROLL_##x(5) \
870 FC_UNROLL_##x(6) \
871 FC_UNROLL_##x(7)
[453]872
[458]873 for (oc = 0; oc < out_channels; oc++)
874 {
875 const float *c_src = src;
876 const float *c_weights = weights + oc * in_channels;
[453]877
[458]878 float sum = 0.0f;
879 for (ic = 0; ic < in_channels / 8; ic++)
880 {
881 FC_UNROLL_S(1);
882 FC_UNROLL_S(2);
883 }
[453]884
[458]885 dest[oc] = sum + bias[oc];
886 }
887 }
888 else
889 {
890 for (oc = 0; oc < out_channels; oc++)
891 {
892 const float *c_weights = weights + oc * in_channels;
[453]893
[458]894 float sum = 0.0f;
895 for (ic = 0; ic < in_channels; ic++)
896 sum += src[ic] * c_weights[ic];
897 dest[oc] = sum + bias[oc];
898 }
899 }
900 free(weights);
901 free(bias);
902 kpu_float_activation(dest, out_channels, arg->act);
[453]903}
904
905static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
906{
[458]907 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
908 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
909 kpu_model_shape_t in_shape = arg->shape;
910 uint32_t oc, oy, ox;
[453]911
[458]912 for (oy = 0; oy < in_shape.height; oy++)
913 for (ox = 0; ox < in_shape.width; ox++)
914 for (oc = 0; oc < in_shape.channels; oc++)
915 *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
[453]916}
917
918static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
919{
[458]920 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
921 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
922 kpu_model_shape_t in_shape = arg->in_shape;
923 uint32_t out_width = arg->out_width, out_height = arg->out_height;
924 uint32_t oc, oy, ox;
[453]925
[458]926 float height_scale = (float)in_shape.height / out_height;
927 float width_scale = (float)in_shape.width / out_width;
[453]928
[458]929 for (oc = 0; oc < in_shape.channels; oc++)
930 {
931 const float *channel_src = src + in_shape.width * in_shape.height * oc;
932 for (oy = 0; oy < out_height; oy++)
933 {
934 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
935 const float *y_origin = channel_src + in_y * in_shape.width;
936 for (ox = 0; ox < out_width; ox++)
937 {
938 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
939 *dest++ = y_origin[in_x];
940 }
941 }
942 }
[453]943}
944
945static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
946{
[458]947 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
948 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
949 kpu_model_shape_t in_shape = arg->in_shape;
950 uint32_t out_width = arg->out_width, out_height = arg->out_height;
951 uint32_t oc, oy, ox;
[453]952
[458]953 float height_scale = (float)in_shape.height / out_height;
954 float width_scale = (float)in_shape.width / out_width;
[453]955
[458]956 for (oc = 0; oc < in_shape.channels; oc++)
957 {
958 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
959 for (oy = 0; oy < out_height; oy++)
960 {
961 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
962 const uint8_t *y_origin = channel_src + in_y * in_shape.width;
963 for (ox = 0; ox < out_width; ox++)
964 {
965 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
966 *dest++ = y_origin[in_x];
967 }
968 }
969 }
[453]970}
971
972static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
973{
[458]974 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
975 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
976 size_t oc, channels = arg->channels;
[453]977
[458]978 for (oc = 0; oc < channels; oc++)
979 dest[oc] = 1.f / (1.f + expf(-src[oc]));
[453]980}
981
982static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
983{
[458]984 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
985 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
986 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
987 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
[453]988
[458]989 if (arg->flags & KLF_MAIN_MEM_OUT)
990 {
991 dmac_channel_number_t dma_ch = ctx->dma_ch;
992 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
993 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
994 .calc_done_int = 1,
995 .layer_cfg_almost_empty_int = 1,
996 .layer_cfg_almost_full_int = 1};
997 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
998 .calc_done_int = 1,
999 .layer_cfg_almost_empty_int = 1,
1000 .layer_cfg_almost_full_int = 1};
1001 layer.dma_parameter.data.send_data_out = 1;
1002 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
1003 if (ctx->current_layer < ctx->layers_length)
1004 dmac_set_irq(dma_ch, ai_step, ctx, 1);
1005 else
1006 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
1007 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
1008 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
1009 }
1010 else
1011 {
1012 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1013 .calc_done_int = 1,
1014 .layer_cfg_almost_empty_int = 1,
1015 .layer_cfg_almost_full_int = 1};
[453]1016
[458]1017 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1018 .calc_done_int = 0,
1019 .layer_cfg_almost_empty_int = 1,
1020 .layer_cfg_almost_full_int = 1};
1021 layer.interrupt_enabe.data.int_en = 1;
1022 }
[453]1023
[458]1024 kpu_send_layer((const kpu_layer_argument_t *)&layer);
[453]1025}
1026
1027static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1028{
[458]1029 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
[453]1030#if USE_CACHED_AI_RAM
[458]1031 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
[453]1032#else
[458]1033 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
[453]1034#endif
1035
[458]1036 uint32_t row_padding = 16;
1037 uint32_t row_group = 4;
1038 uint32_t row_length = 1;
1039 uint32_t height = 4;
1040 uint32_t oc, x, y, channels = arg->channels;
[453]1041
[458]1042 for (oc = 0; oc < channels; oc++)
1043 {
1044 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
1045 for (y = 0; y < 1; y++)
1046 {
1047 uint8_t *y_origin = channel_origin + y * row_length * 64;
1048 for (x = 0; x < 1; x++)
1049 y_origin[x] = *src++;
1050 }
1051 }
[453]1052
1053#if USE_CACHED_AI_RAM
[458]1054 uint32_t lines = row_length * height * channels / row_group;
1055 kpu_flush_cache(arg->kpu_mem_out_address, lines);
[453]1056#endif
1057}
1058
1059static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1060{
[458]1061 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1062 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1063 uint32_t oc, channels = arg->channels;
[453]1064
[458]1065 for (oc = 0; oc < channels; oc++)
1066 *dest++ = src[oc * 16];
[453]1067}
1068
1069static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
1070{
[458]1071 size_t width = arg->width;
1072 size_t height = arg->height;
1073 size_t channels = arg->channels;
[453]1074
[458]1075 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
[453]1076}
1077
1078int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
1079{
1080#if FIX_CACHE
[458]1081 configASSERT(is_memory_cache((uintptr_t)buffer));
[453]1082#endif
[458]1083 uintptr_t base_addr = (uintptr_t)buffer;
1084 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
[453]1085
[458]1086 if (header->version == 3 && header->arch == 0)
1087 {
1088 ctx->model_buffer = buffer;
1089 ctx->output_count = header->output_count;
1090 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
1091 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
1092 ctx->layers_length = header->layers_length;
1093 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
1094 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
1095 if (!ctx->main_buffer)
1096 return -1;
1097 uint32_t body_size = 0;
1098 for (int i = 0; i < ctx->layers_length; i++)
1099 {
1100 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
1101 body_size += cnt_layer_header->body_size;
1102 }
1103 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
1104 const uint8_t *body_start_cache = ctx->body_start;
1105 memcpy(body_start_iomem, body_start_cache, body_size);
1106 for (int i = 0; i < body_size; i++)
1107 {
1108 configASSERT(body_start_iomem[i] == body_start_cache[i]);
1109 }
1110 }
1111 else
1112 {
1113 return -1;
1114 }
[453]1115
[458]1116 return 0;
[453]1117}
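/*
 * Illustrative note (reconstructed from the parsing code above): a kmodel
 * version-3 blob is laid out as
 *
 *   kpu_kmodel_header_t
 *   kpu_model_output_t       x header->output_count
 *   kpu_model_layer_header_t x header->layers_length
 *   concatenated layer bodies (cnt_layer_header->body_size bytes each)
 *
 * ctx->main_buffer is a separate malloc()'d scratch area of
 * header->main_mem_usage bytes that holds intermediate tensors.
 */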
1118
1119int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
1120{
[458]1121 if (index >= ctx->output_count)
1122 return -1;
[453]1123
[458]1124 const kpu_model_output_t *output = ctx->outputs + index;
1125 *data = ctx->main_buffer + output->address;
1126 *size = output->size;
1127 return 0;
[453]1128}
1129
1130void kpu_model_free(kpu_model_context_t *ctx)
1131{
[458]1132 free(ctx->main_buffer);
1133 ctx->main_buffer = NULL;
[453]1134}
1135
1136#if KPU_DEBUG
1137static uint64_t last_time;
1138static uint64_t total_time;
1139static uint64_t kpu_time;
1140static uint32_t last_layer_type;
1141
1142static const char *str_layer_type(uint32_t type)
1143{
[458]1144 switch (type)
1145 {
1146 case KL_ADD:
1147 return "Add";
1148 case KL_QUANTIZED_ADD:
1149 return "QuantAdd";
1150 case KL_GLOBAL_AVERAGE_POOL2D:
1151 return "GAP";
1152 case KL_QUANTIZED_MAX_POOL2D:
1153 return "QuantMaxPool2d";
1154 case KL_AVERAGE_POOL2D:
1155 return "AveragePool2d";
1156 case KL_QUANTIZE:
1157 return "Quantize";
1158 case KL_DEQUANTIZE:
1159 return "Dequantize";
1160 case KL_REQUANTIZE:
1161 return "Requantize";
1162 case KL_L2_NORMALIZATION:
1163 return "L2Norm";
1164 case KL_SOFTMAX:
1165 return "Softmax";
1166 case KL_CONCAT:
1167 return "Concat";
1168 case KL_QUANTIZED_CONCAT:
1169 return "QuantConcat";
1170 case KL_FULLY_CONNECTED:
1171 return "FullyConnected";
1172 case KL_TENSORFLOW_FLATTEN:
1173 return "TFFlatten";
1174 case KL_RESIZE_NEAREST_NEIGHBOR:
1175 return "ResizeNearestNeighbor";
1176 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1177 return "QuantResizeNearestNeighbor";
1178 case KL_CHANNELWISE_DEQUANTIZE:
1179 return "ChannelwiseDequantize";
1180 case KL_LOGISTIC:
1181 return "Logistic";
1182 case KL_K210_CONV:
1183 return "K210Conv";
1184 case KL_K210_ADD_PADDING:
1185 return "K210AddPad";
1186 case KL_K210_REMOVE_PADDING:
1187 return "K210RemovePad";
1188 case KL_K210_UPLOAD:
1189 return "K210Upload";
1190 default:
1191 return "Unknown";
1192 }
[453]1193}
1194#endif
1195
1196static int kpu_kmodel_done(kpu_model_context_t *ctx)
1197{
[458]1198 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1199 .calc_done_int = 1,
1200 .layer_cfg_almost_empty_int = 1,
1201 .layer_cfg_almost_full_int = 1};
1202 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1203 .calc_done_int = 1,
1204 .layer_cfg_almost_empty_int = 1,
1205 .layer_cfg_almost_full_int = 1};
[453]1206#if KPU_DEBUG
[458]1207 uint32_t cnt_layer_id = ctx->current_layer;
1208 uint64_t time = sysctl_get_time_us();
1209 if (last_time != 0)
1210 {
1211 uint64_t layer_time = time - last_time;
1212 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1213 total_time += layer_time;
1214 if (last_layer_type == KL_K210_CONV)
1215 kpu_time += layer_time;
1216 }
[453]1217
[458]1218 syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
1219 syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
1220 syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
[453]1221#endif
[458]1222 ctx->done_callback(ctx->userdata);
1223 return 0;
[453]1224}
1225
1226static int ai_step(void *userdata)
1227{
[458]1228 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
[453]1229
[458]1230 uint32_t cnt_layer_id = ctx->current_layer;
1231 const uint8_t *layer_body = ctx->current_body;
1232 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
1233 if (cnt_layer_id >= ctx->layers_length)
1234 {
1235 //syslog(LOG_NOTICE, "overrun");
1236 kpu_kmodel_done(ctx);
1237 return -1;
1238 }
[453]1239
[458]1240 ctx->current_layer++;
1241 ctx->current_body += cnt_layer_header->body_size;
[453]1242
1243#if KPU_DEBUG
[458]1244 uint64_t time = sysctl_get_time_us();
1245 if (last_time != 0)
1246 {
1247 uint64_t layer_time = time - last_time;
1248 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1249 total_time += layer_time;
1250 if (last_layer_type == KL_K210_CONV)
1251 kpu_time += layer_time;
1252 }
[453]1253
[458]1254 last_layer_type = cnt_layer_header->type;
1255 last_time = sysctl_get_time_us();
[453]1256#endif
1257
[458]1258 switch (cnt_layer_header->type)
1259 {
1260 case KL_ADD:
1261 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
1262 break;
1263 case KL_QUANTIZED_ADD:
1264 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
1265 break;
1266 case KL_GLOBAL_AVERAGE_POOL2D:
1267 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
1268 break;
1269 case KL_QUANTIZED_MAX_POOL2D:
1270 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
1271 break;
1272 case KL_AVERAGE_POOL2D:
1273 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
1274 break;
1275 case KL_QUANTIZE:
1276 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
1277 break;
1278 case KL_DEQUANTIZE:
1279 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
1280 break;
1281 case KL_REQUANTIZE:
1282 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
1283 break;
1284 case KL_L2_NORMALIZATION:
1285 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
1286 break;
1287 case KL_SOFTMAX:
1288 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
1289 break;
1290 case KL_CONCAT:
1291 case KL_QUANTIZED_CONCAT:
1292 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
1293 break;
1294 case KL_FULLY_CONNECTED:
1295 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
1296 break;
1297 case KL_TENSORFLOW_FLATTEN:
1298 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
1299 break;
1300 case KL_RESIZE_NEAREST_NEIGHBOR:
1301 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1302 break;
1303 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1304 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1305 break;
1306 case KL_CHANNELWISE_DEQUANTIZE:
1307 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
1308 break;
1309 case KL_LOGISTIC:
1310 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
1311 break;
1312 case KL_K210_CONV:
1313 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
1314 return 0;
1315 case KL_K210_ADD_PADDING:
1316 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
1317 break;
1318 case KL_K210_REMOVE_PADDING:
1319 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
1320 break;
1321 case KL_K210_UPLOAD:
1322 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
1323 break;
1324 default:
1325 assert(!"Layer is not supported.");
1326 kpu_kmodel_done(ctx);
1327 return -1;
1328 }
[453]1329
[458]1330 if (ctx->current_layer < (ctx->layers_length - 1))
1331 ai_step(userdata);
1332 else
1333 kpu_kmodel_done(ctx);
1334 return 0;
[453]1335}
1336
1337static void ai_step_not_isr(void *userdata)
1338{
[458]1339 dis_int(INTNO_DMAAI);
1340 dis_int(INTNO_AI);
1341
1342 ai_step(userdata);
1343
1344 ena_int(INTNO_DMAAI);
1345 ena_int(INTNO_AI);
[453]1346}
1347
1348int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
1349{
[458]1350 ctx->dma_ch = dma_ch;
1351 ctx->done_callback = done_callback;
1352 ctx->userdata = userdata;
1353 ctx->current_layer = 0;
1354 ctx->current_body = ctx->body_start;
[453]1355#if KPU_DEBUG
[458]1356 last_time = 0;
1357 total_time = 0;
1358 kpu_time = 0;
[453]1359#endif
1360
[458]1361 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
1362 kpu->interrupt_clear.reg = 7;
1363 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
1364 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
1365 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
1366 .eight_bit_mode = header->flags & 1};
1367 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1368 .calc_done_int = 1,
1369 .layer_cfg_almost_empty_int = 0,
1370 .layer_cfg_almost_full_int = 1};
[453]1371
[458]1372 //plic_set_priority(INTNO_AI, 1);
1373 plic_irq_register(INTNO_AI, ai_step, ctx);
1374 plic_irq_enable(INTNO_AI);
[453]1375
[458]1376 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
[453]1377
[458]1378 switch (first_layer_header->type)
1379 {
1380 case KL_K210_CONV:
1381 {
1382 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
1383 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
[453]1384
[458]1385 if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
1386 {
1387 kpu_kmodel_input_with_padding(&layer_arg, src);
1388 ai_step_not_isr(ctx);
1389 }
1390 else
1391 {
1392 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
1393 }
1394 }
1395 break;
1396 case KL_FULLY_CONNECTED:
1397 {
1398 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
1399 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
1400 ai_step_not_isr(ctx);
1401 }
1402 break;
1403 default:
1404 return -1;
1405 }
[453]1406
[458]1407 return 0;
[453]1408}
[458]1409
1410ER kpu_init(kpu_model_context_t *ctx)
1411{
1412 g_ai_hdma.chnum = AI_DMA_CH;
1413 g_ai_hdma.xfercallback = ai_dma_done_isr;
1414 g_ai_hdma.errorcallback = NULL;
1415 g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;      /* DMA request selection */
1416 g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;    /* DMA transfer direction */
1417 g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;  /* source multi-block type */
1418 g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;  /* destination multi-block type */
1419 g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;     /* source handshake */
1420 g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;     /* destination handshake */
1421 g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
1422 g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
1423 g_ai_hdma.Init.Priority = 4;                         /* priority */
1424 g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;             /* source master setting */
1425 g_ai_hdma.Init.DstMaster = DMAC_MASTER2;             /* destination master setting */
1426 g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;          /* source increment setting */
1427 g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;         /* destination increment setting */
1428 g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32;  /* source transfer width */
1429 g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32;  /* destination transfer width */
1430 g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;          /* source burst size */
1431 g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;          /* destination burst size */
1432 g_ai_hdma.Init.IocBlkTrans = 0;                      /* IOC block transfer */
1433 g_ai_hdma.localdata = (void *)ctx;
1434
1435 return dma_init(&g_ai_hdma);
1436}
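/*
 * Minimal usage sketch (illustrative, not part of the original file): a
 * typical call sequence for the API above. The names model_data, image and
 * on_done are placeholders, and the callback signature assumes that
 * kpu_done_callback_t takes the userdata pointer, matching the invocation
 * from kpu_kmodel_done() above.
 */
#if 0
static kpu_model_context_t g_task;

static void on_done(void *userdata)
{
    uint8_t *output;
    size_t size;
    if (kpu_get_output(&g_task, 0, &output, &size) == 0)
        syslog(LOG_NOTICE, "inference finished, output 0 is %d bytes", (int)size);
}

static void run_model_once(const uint8_t *model_data, const uint8_t *image)
{
    kpu_init(&g_task);                                        /* set up the AI DMA handle        */
    if (kpu_load_kmodel(&g_task, model_data) != 0)            /* parse headers, allocate buffers */
        return;
    kpu_run_kmodel(&g_task, image, AI_DMA_CH, on_done, NULL); /* start layer 0; the rest runs from the DMA/KPU interrupts */
}
#endif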