[453] | 1 | #include <assert.h>
|
---|
| 2 | #include <float.h>
|
---|
| 3 | #include <math.h>
|
---|
| 4 | #include <stdio.h>
|
---|
| 5 | #include <stdlib.h>
|
---|
| 6 | #include <string.h>
|
---|
| 7 | #include <stdint.h>
|
---|
| 8 | #include <kernel.h>
|
---|
| 9 | #include <t_syslog.h>
|
---|
| 10 | #include <t_stdlib.h>
|
---|
| 11 | #include <kernel_impl.h>
|
---|
| 12 | #include <target_syssvc.h>
|
---|
| 13 | #include "kendryte-k210.h"
|
---|
| 14 | #include "device.h"
|
---|
| 15 | #include "atomic.h"
|
---|
| 16 | #include "kpu.h"
|
---|
| 17 | #include "utils.h"
|
---|
| 18 | #include "kpu_main.h"
|
---|
[458] | 19 | #include "kernel_cfg.h"
|
---|
[453] | 20 |
|
---|
/* Read-modify-write helper: OR the bits of (b) into the word at address (a).
   NOTE: (a) is evaluated twice — do not pass an expression with side effects. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
| 22 |
|
---|
| 23 | uint64_t sysctl_get_time_us(void)
|
---|
| 24 | {
|
---|
[458] | 25 | uint64_t v_cycle = read_cycle();
|
---|
| 26 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
[453] | 27 | }
|
---|
| 28 |
|
---|
/*
 * Classify an address: returns 1 iff it lies in the 6 MiB cached SRAM
 * window, the 8 MiB uncached SRAM window, or is the special address
 * 0x50450040; returns 0 otherwise.
 */
static int is_memory(uintptr_t address)
{
    const uintptr_t cached_base = 0x80000000;
    const uintptr_t cached_len = 6 * 1024 * 1024;
    const uintptr_t uncached_base = 0x40000000;
    const uintptr_t uncached_len = 8 * 1024 * 1024;

    if (address >= cached_base && address < cached_base + cached_len)
        return 1;
    if (address >= uncached_base && address < uncached_base + uncached_len)
        return 1;
    return address == 0x50450040;
}
|
---|
| 38 |
|
---|
/* Nonzero iff the address lies inside the 6 MiB cached SRAM window. */
uint32_t is_memory_cache(uintptr_t address)
{
    const uintptr_t cache_base = 0x80000000;
    const uintptr_t cache_len = 6 * 1024 * 1024;

    return address >= cache_base && address < cache_base + cache_len;
}
|
---|
| 45 |
|
---|
| 46 | int plic_irq_enable(INTNO irq_number)
|
---|
| 47 | {
|
---|
[458] | 48 | if (irq_number != INTNO_AI)
|
---|
| 49 | return -1;
|
---|
| 50 | ena_int(irq_number);
|
---|
| 51 | return 0;
|
---|
[453] | 52 | }
|
---|
| 53 |
|
---|
| 54 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
| 55 | {
|
---|
[458] | 56 | if (irq_number != INTNO_AI)
|
---|
| 57 | return -1;
|
---|
| 58 | set_ipriority(irq_number, priority);
|
---|
| 59 | return 0;
|
---|
[453] | 60 | }
|
---|
| 61 |
|
---|
/* Callback (and its user argument) invoked from ai_done_isr() when the KPU
   raises its completion interrupt.  Registered via plic_irq_register(). */
plic_irq_callback_t ai_done_callback;
void *ai_done_ctx;
|
---|
| 64 |
|
---|
| 65 | void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
|
---|
| 66 | {
|
---|
[458] | 67 | if (irq != INTNO_AI)
|
---|
| 68 | return;
|
---|
[453] | 69 |
|
---|
[458] | 70 | dis_int(INTNO_AI);
|
---|
[453] | 71 |
|
---|
[458] | 72 | ai_done_callback = callback;
|
---|
| 73 | ai_done_ctx = ctx;
|
---|
[453] | 74 |
|
---|
[458] | 75 | ena_int(INTNO_AI);
|
---|
[453] | 76 | }
|
---|
| 77 |
|
---|
| 78 | void ai_done_isr(intptr_t exinf)
|
---|
| 79 | {
|
---|
[458] | 80 | dis_int(INTNO_AI);
|
---|
| 81 | if (ai_done_callback != NULL)
|
---|
| 82 | {
|
---|
| 83 | ai_done_callback(ai_done_ctx);
|
---|
| 84 | }
|
---|
| 85 | ena_int(INTNO_AI);
|
---|
[453] | 86 | }
|
---|
| 87 |
|
---|
/* Callback (and its user argument) invoked from ai_dma_done_isr() when the
   AI DMA transfer completes.  Registered via kpu_dmac_irq_register() /
   dmac_set_irq(). */
plic_irq_callback_t ai_dma_done_callback;
void *ai_dma_done_ctx;
|
---|
| 90 |
|
---|
| 91 | void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
|
---|
[458] | 92 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
[453] | 93 | {
|
---|
[458] | 94 | if (channel_num != AI_DMA_CH)
|
---|
| 95 | return;
|
---|
[453] | 96 |
|
---|
[458] | 97 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
[453] | 98 |
|
---|
[458] | 99 | dis_int(INTNO_DMAAI);
|
---|
[453] | 100 |
|
---|
[458] | 101 | ai_dma_done_callback = dmac_callback;
|
---|
| 102 | ai_dma_done_ctx = ctx;
|
---|
[453] | 103 |
|
---|
[458] | 104 | ena_int(INTNO_DMAAI);
|
---|
[453] | 105 | }
|
---|
| 106 |
|
---|
| 107 | void ai_dma_done_isr(DMA_Handle_t *dma)
|
---|
| 108 | {
|
---|
[458] | 109 | dis_int(INTNO_DMAAI);
|
---|
| 110 |
|
---|
| 111 | if (ai_dma_done_callback != NULL)
|
---|
| 112 | {
|
---|
| 113 | ai_dma_done_callback(ai_dma_done_ctx);
|
---|
| 114 | }
|
---|
| 115 |
|
---|
| 116 | ena_int(INTNO_DMAAI);
|
---|
[453] | 117 | }
|
---|
| 118 |
|
---|
| 119 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
[458] | 120 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
[453] | 121 | {
|
---|
[458] | 122 | if (channel_num != AI_DMA_CH)
|
---|
| 123 | return;
|
---|
[453] | 124 |
|
---|
[458] | 125 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
[453] | 126 |
|
---|
[458] | 127 | dis_int(INTNO_DMAAI);
|
---|
[453] | 128 |
|
---|
[458] | 129 | ai_dma_done_callback = dmac_callback;
|
---|
| 130 | ai_dma_done_ctx = ctx;
|
---|
[453] | 131 |
|
---|
[458] | 132 | ena_int(INTNO_DMAAI);
|
---|
[453] | 133 | }
|
---|
| 134 |
|
---|
/* DMA handle used for all KPU (AI) transfers on AI_DMA_CH. */
DMA_Handle_t g_ai_hdma;
|
---|
| 136 |
|
---|
| 137 | void dmac_set_single_mode(dmac_channel_number_t channel_num,
|
---|
[458] | 138 | const void *src, void *dest, uint8_t src_inc,
|
---|
| 139 | uint8_t dest_inc,
|
---|
| 140 | uint8_t dmac_burst_size,
|
---|
| 141 | uint8_t dmac_trans_width,
|
---|
| 142 | size_t block_size)
|
---|
[453] | 143 | {
|
---|
[458] | 144 | if (channel_num != AI_DMA_CH)
|
---|
| 145 | return;
|
---|
[453] | 146 |
|
---|
[458] | 147 | DMA_Handle_t *hdma = &g_ai_hdma;
|
---|
| 148 | int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
|
---|
| 149 | uint8_t flow_control;
|
---|
| 150 | if (mem_type_src == 0 && mem_type_dest == 0)
|
---|
| 151 | flow_control = DMA_PERIPH_TO_PERIPH;
|
---|
| 152 | else if (mem_type_src == 1 && mem_type_dest == 0)
|
---|
| 153 | flow_control = DMA_MEMORY_TO_PERIPH;
|
---|
| 154 | else if (mem_type_src == 0 && mem_type_dest == 1)
|
---|
| 155 | flow_control = DMA_PERIPH_TO_MEMORY;
|
---|
| 156 | else
|
---|
| 157 | flow_control = DMA_MEMORY_TO_MEMORY;
|
---|
[453] | 158 |
|
---|
[458] | 159 | hdma->Init.Direction = flow_control; /* DMA転送方向 */
|
---|
| 160 | hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* ソースハンドシェイク */
|
---|
| 161 | hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* デスティネーションハンドシェイク */
|
---|
| 162 | hdma->Init.SrcInc = src_inc; /* ソースインクリメント設定 */
|
---|
| 163 | hdma->Init.DstInc = dest_inc; /* デスティネーションインクリメント設定 */
|
---|
| 164 | hdma->Init.SrcTransWidth = dmac_trans_width; /* ソース転送幅 */
|
---|
| 165 | hdma->Init.DstTransWidth = dmac_trans_width; /* デスティネーション転送幅 */
|
---|
| 166 | hdma->Init.SrcBurstSize = dmac_burst_size; /* ソースバーストサイズ */
|
---|
| 167 | hdma->Init.DstBurstSize = dmac_burst_size; /* デスティネーションバーストサイズ */
|
---|
| 168 | dma_reset(hdma);
|
---|
[453] | 169 |
|
---|
[458] | 170 | dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
|
---|
[453] | 171 | }
|
---|
| 172 |
|
---|
#define LAYER_BURST_SIZE 12

#define KPU_DEBUG 0
#define USE_CACHED_AI_RAM 0

/* NOTE: min/max evaluate their chosen argument twice — avoid side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
/* Round x up to the next multiple of align (align must be a power of two).
   Fully parenthesized: the previous form left `x` bare, so an argument such
   as `c ? a : b` expanded with the wrong precedence. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
| 181 |
|
---|
/* Forward declarations for the layer state machine. */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);

/* Memory-mapped KPU register bank. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* KPU status flags; NOTE(review): not referenced in this chunk — confirm usage elsewhere. */
static volatile uint32_t kpu_status;
|
---|
| 187 |
|
---|
/*
 * Push one layer descriptor into the KPU's layer-argument FIFO.
 * The twelve words must be written in exactly this order — it is the
 * order the hardware consumes them.  ("interrupt_enabe" is the field's
 * spelling in the register definition.)
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
| 203 |
|
---|
| 204 | void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
|
---|
| 205 | {
|
---|
[458] | 206 | uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
|
---|
| 207 | dmac_set_irq(dma_ch, callback, userdata, 1);
|
---|
| 208 | dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
|
---|
| 209 | DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
|
---|
[453] | 210 | }
|
---|
| 211 |
|
---|
/* Feed one convolution layer's argument set into the KPU FIFO. */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
| 216 |
|
---|
/*
 * Start a convolution layer: clear all pending KPU interrupts, program
 * the interrupt mask, then queue the layer arguments.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    /* Clear any stale calc-done / FIFO-level interrupts. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* Mask register: only layer_cfg_almost_empty differs (0).
       NOTE(review): mask polarity depends on kpu_config_interrupt_t —
       confirm which interrupts remain enabled. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
| 229 |
|
---|
/*
 * Global average pooling on quantized data, requantizing from src_param
 * to dest_param: each of `channels` outputs is the clamped mean of
 * `kernel_size` consecutive input bytes.
 *
 * When dest points into KPU I/O RAM (AI_IO_BASE_ADDR .. +2 MiB) the
 * result is written in the KPU's padded row layout (constants below match
 * the width<=16 case of kpu_upload_core); otherwise it is written as a
 * plain contiguous byte array.
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU layout parameters for a 1x1 output feature map. */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for (oc = 0; oc < channels; oc++)
        {
            /* Base of this channel's slot in the padded layout. */
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < 1; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for (i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize the mean with q1, requantize with q2, clamp to u8. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if (value < 0)
                        value = 0;
                    if (value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    }
    else
    {
        for (oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for (i = 0; i < kernel_size; i++)
                sum += *src++;

            /* Dequantize the mean with q1, requantize with q2, clamp to u8. */
            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if (value < 0)
                value = 0;
            if (value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
| 283 |
|
---|
| 284 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
| 285 | {
|
---|
[458] | 286 | quantize_param_t q = *src_param;
|
---|
| 287 | size_t oc;
|
---|
[453] | 288 |
|
---|
[458] | 289 | for (oc = 0; oc < channels; oc++)
|
---|
| 290 | {
|
---|
| 291 | int64_t sum = 0;
|
---|
| 292 | size_t i;
|
---|
| 293 | for (i = 0; i < kernel_size; i++)
|
---|
| 294 | sum += *src++;
|
---|
[453] | 295 |
|
---|
[458] | 296 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
| 297 | dest[oc] = value;
|
---|
| 298 | }
|
---|
[453] | 299 | }
|
---|
| 300 |
|
---|
#if USE_CACHED_AI_RAM
/* Copy `lines` 64-byte blocks starting at KPU address `addr` (in 64-byte
   units) from the cached AI RAM alias to the I/O window. */
static void kpu_flush_cache(uint32_t addr, size_t lines)
{
    size_t line;
    for (line = 0; line < lines; line++)
    {
        const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
        uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
        size_t i;
        /* 8 x 64-bit words = one 64-byte line. */
        for (i = 0; i < 8; i++)
            dest[i] = src[i];
    }
}
#endif
|
---|
/*
 * Arithmetic right shift with round-half-away-from-zero: shifts `value`
 * right by `shift` bits, rounding on the last discarded bit.  A shift of
 * 0 returns the value unchanged.
 */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if (shift == 0)
        return value;

    /* Shift all but the final bit, then use that bit to round. */
    value >>= shift - 1;
    if ((value & 0x1) == 0)
        return value >> 1;
    return (value >> 1) + (value < 0 ? -1 : 1);
}
|
---|
/*
 * Copy a width x height x channels uint8 image into KPU I/O RAM at
 * kpu_addr (in 64-byte units), applying the KPU's padded row layout:
 * depending on width, row_group channels share a 64-byte row with
 * row_padding bytes per channel, and each image row occupies row_length
 * 64-byte units.  A 64-bit word fast path is used when both src and
 * width are 8-byte aligned.
 */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Layout selection by image width (row_length is in 64-byte units). */
    if (width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    }
    else if (width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    }
    else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }

    if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* Open the per-channel / per-row loops; y_origin addresses the
   destination row as 64-bit words.  UPLOAD_END closes both loops. */
#define UPLOAD_BEGIN() \
    for (oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for (y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
        } \
    }

        /* width is now measured in 64-bit words. */
        width /= 8;
        const uint64_t *u64_src = (const uint64_t *)src;
        if (width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        }
        else if (width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        }
        else if (width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        }
        else
        {
            UPLOAD_BEGIN()
            for (x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    }
    else
    {
        /* Unaligned fallback: byte-by-byte copy into the padded layout. */
        for (oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
|
---|
| 425 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
| 426 | {
|
---|
[458] | 427 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
| 428 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
| 429 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
[453] | 430 |
|
---|
[458] | 431 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
[453] | 432 | }
|
---|
| 433 |
|
---|
/* Copy `count` float inputs into the working buffer (regions must not overlap). */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    size_t i;
    for (i = 0; i < count; i++)
        dest[i] = src[i];
}
|
---|
| 438 |
|
---|
| 439 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
| 440 | {
|
---|
[458] | 441 | size_t i;
|
---|
[453] | 442 |
|
---|
[458] | 443 | if (act == KLA_RELU)
|
---|
| 444 | {
|
---|
| 445 | for (i = 0; i < count; i++)
|
---|
| 446 | data[i] = max(data[i], 0);
|
---|
| 447 | }
|
---|
| 448 | else if (act == KLA_RELU6)
|
---|
| 449 | {
|
---|
| 450 | for (i = 0; i < count; i++)
|
---|
| 451 | data[i] = min(max(data[i], 0), 6);
|
---|
| 452 | }
|
---|
[453] | 453 | }
|
---|
| 454 |
|
---|
| 455 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 456 | {
|
---|
[458] | 457 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
| 458 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
| 459 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 460 | size_t i, count = arg->count;
|
---|
[453] | 461 |
|
---|
[458] | 462 | for (i = 0; i < count; i++)
|
---|
| 463 | dest[i] = src_a[i] + src_b[i];
|
---|
[453] | 464 | }
|
---|
| 465 |
|
---|
/*
 * Quantized elementwise add.  Each input byte is offset, multiplied and
 * shifted onto a common scale, the pair is summed, then the result is
 * rescaled to the output quantization (with round-half-away via
 * kpu_carry_shift) and clamped to [0, 255].  Elements are processed 8 at
 * a time, stage by stage, through the QADD_UNROLL_* macros; count is
 * rounded up to a multiple of 8, so the buffers are expected to be padded
 * accordingly.
 */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if (sh_a == sh_b)
    {
/* Equal input shifts: sum first, apply the common shift once. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#define QADD_UNROLL_8(x) \
    v##x += off_o;

#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Expand one pipeline stage for all 8 elements of the group. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
    else
    {
/* Unequal input shifts: shift each operand before summing. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
|
---|
| 609 |
|
---|
| 610 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 611 | {
|
---|
[458] | 612 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 613 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 614 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
[453] | 615 |
|
---|
[458] | 616 | for (oc = 0; oc < channels; oc++)
|
---|
| 617 | {
|
---|
| 618 | float sum = 0.f;
|
---|
| 619 | size_t i;
|
---|
| 620 | for (i = 0; i < kernel_size; i++)
|
---|
| 621 | sum += *src++;
|
---|
[453] | 622 |
|
---|
[458] | 623 | dest[oc] = sum / kernel_size;
|
---|
| 624 | }
|
---|
[453] | 625 | }
|
---|
| 626 |
|
---|
/*
 * Quantized 2D max pooling over planar data (each channel is a
 * contiguous width*height plane).  The kernel window is clipped at the
 * borders so padding never contributes; `value` starts at 0, so a
 * window fully inside the padding produces 0.
 */
static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Window origin in input coordinates (may be negative
                   inside the padding border). */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                uint8_t value = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }

                *dest++ = value;
            }
        }
    }
}
|
---|
| 669 |
|
---|
/*
 * Float 2D average pooling over planar data (each channel is a
 * contiguous width*height plane).  The kernel window is clipped at the
 * borders and the divisor counts only in-bounds samples, so padding
 * does not dilute the average.
 */
static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Window origin in input coordinates (may be negative
                   inside the padding border). */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                float kernel_count = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }

                *dest++ = value / kernel_count;
            }
        }
    }
}
|
---|
| 714 |
|
---|
| 715 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 716 | {
|
---|
[458] | 717 | size_t count = arg->count;
|
---|
| 718 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
[453] | 719 |
|
---|
[458] | 720 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
[453] | 721 |
|
---|
[458] | 722 | float scale = 1.f / q.scale;
|
---|
[453] | 723 |
|
---|
[458] | 724 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
| 725 | size_t i;
|
---|
| 726 | for (i = 0; i < count; i++)
|
---|
| 727 | {
|
---|
| 728 | int value = roundf((*src++ - q.bias) * scale);
|
---|
| 729 | if (value < 0)
|
---|
| 730 | value = 0;
|
---|
| 731 | if (value > 0xFF)
|
---|
| 732 | value = 0xFF;
|
---|
| 733 | *dest++ = (uint8_t)value;
|
---|
| 734 | }
|
---|
[453] | 735 | }
|
---|
| 736 |
|
---|
| 737 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 738 | {
|
---|
[458] | 739 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 740 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 741 | size_t oc, count = arg->count;
|
---|
| 742 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
[453] | 743 |
|
---|
[458] | 744 | for (oc = 0; oc < count; oc++)
|
---|
| 745 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
[453] | 746 | }
|
---|
| 747 |
|
---|
| 748 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 749 | {
|
---|
[458] | 750 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 751 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 752 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
[453] | 753 |
|
---|
[458] | 754 | for (oc = 0; oc < channels; oc++)
|
---|
| 755 | {
|
---|
| 756 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
[453] | 757 |
|
---|
[458] | 758 | for (i = 0; i < count; i++)
|
---|
| 759 | *dest++ = *src++ * q.scale + q.bias;
|
---|
| 760 | }
|
---|
[453] | 761 | }
|
---|
| 762 |
|
---|
| 763 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 764 | {
|
---|
[458] | 765 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 766 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 767 | size_t oc, count = arg->count;
|
---|
| 768 | const uint8_t *table = arg->table;
|
---|
[453] | 769 |
|
---|
[458] | 770 | if (false && count % 8 == 0)
|
---|
| 771 | {
|
---|
| 772 | for (oc = 0; oc < count;)
|
---|
| 773 | {
|
---|
| 774 | dest[oc++] = table[*src++];
|
---|
| 775 | dest[oc++] = table[*src++];
|
---|
| 776 | dest[oc++] = table[*src++];
|
---|
| 777 | dest[oc++] = table[*src++];
|
---|
| 778 | dest[oc++] = table[*src++];
|
---|
| 779 | dest[oc++] = table[*src++];
|
---|
| 780 | dest[oc++] = table[*src++];
|
---|
| 781 | dest[oc++] = table[*src++];
|
---|
| 782 | }
|
---|
| 783 | }
|
---|
| 784 | else
|
---|
| 785 | {
|
---|
| 786 | for (oc = 0; oc < count; oc++)
|
---|
| 787 | dest[oc] = table[src[oc]];
|
---|
| 788 | }
|
---|
[453] | 789 | }
|
---|
| 790 |
|
---|
| 791 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 792 | {
|
---|
[458] | 793 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 794 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 795 | size_t oc, channels = arg->channels;
|
---|
[453] | 796 |
|
---|
[458] | 797 | float sum = 0.f;
|
---|
| 798 | const float epsilon = 1e-10f;
|
---|
| 799 | for (oc = 0; oc < channels; oc++)
|
---|
| 800 | sum += src[oc] * src[oc];
|
---|
| 801 | if (sum < epsilon)
|
---|
| 802 | sum = epsilon;
|
---|
| 803 | sum = 1.f / sqrtf(sum);
|
---|
| 804 | for (oc = 0; oc < channels; oc++)
|
---|
| 805 | dest[oc] = src[oc] * sum;
|
---|
[453] | 806 | }
|
---|
| 807 |
|
---|
| 808 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 809 | {
|
---|
[458] | 810 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 811 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 812 | size_t oc, channels = arg->channels;
|
---|
[453] | 813 |
|
---|
[458] | 814 | float max = FLT_MIN;
|
---|
| 815 | for (oc = 0; oc < channels; oc++)
|
---|
| 816 | max = fmaxf(max, src[oc]);
|
---|
[453] | 817 |
|
---|
[458] | 818 | float sum = 0.f;
|
---|
| 819 | for (oc = 0; oc < channels; oc++)
|
---|
| 820 | {
|
---|
| 821 | float value = expf(src[oc] - max);
|
---|
| 822 | sum += value;
|
---|
| 823 | dest[oc] = value;
|
---|
| 824 | }
|
---|
[453] | 825 |
|
---|
[458] | 826 | for (oc = 0; oc < channels; oc++)
|
---|
| 827 | dest[oc] /= sum;
|
---|
[453] | 828 | }
|
---|
| 829 |
|
---|
| 830 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 831 | {
|
---|
[458] | 832 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 833 | uint32_t count = arg->input_count, i;
|
---|
[453] | 834 |
|
---|
[458] | 835 | for (i = 0; i < count; i++)
|
---|
| 836 | {
|
---|
| 837 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
| 838 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
| 839 | memcpy(dest, src, input.size);
|
---|
| 840 | dest += input.size;
|
---|
| 841 | }
|
---|
[453] | 842 | }
|
---|
| 843 |
|
---|
| 844 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 845 | {
|
---|
[458] | 846 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 847 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 848 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
| 849 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
| 850 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
| 851 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
| 852 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
[453] | 853 |
|
---|
[458] | 854 | if (in_channels % 8 == 0)
|
---|
| 855 | {
|
---|
[453] | 856 | #define FC_UNROLL_1(x) \
|
---|
[458] | 857 | float i##x = *c_src++; \
|
---|
| 858 | float w##x = *c_weights++;
|
---|
[453] | 859 |
|
---|
| 860 | #define FC_UNROLL_2(x) \
|
---|
[458] | 861 | sum += i##x * w##x;
|
---|
[453] | 862 |
|
---|
| 863 | #define FC_UNROLL_S(x) \
|
---|
[458] | 864 | FC_UNROLL_##x(0) \
|
---|
| 865 | FC_UNROLL_##x(1) \
|
---|
| 866 | FC_UNROLL_##x(2) \
|
---|
| 867 | FC_UNROLL_##x(3) \
|
---|
| 868 | FC_UNROLL_##x(4) \
|
---|
| 869 | FC_UNROLL_##x(5) \
|
---|
| 870 | FC_UNROLL_##x(6) \
|
---|
| 871 | FC_UNROLL_##x(7)
|
---|
[453] | 872 |
|
---|
[458] | 873 | for (oc = 0; oc < out_channels; oc++)
|
---|
| 874 | {
|
---|
| 875 | const float *c_src = src;
|
---|
| 876 | const float *c_weights = weights + oc * in_channels;
|
---|
[453] | 877 |
|
---|
[458] | 878 | float sum = 0.0f;
|
---|
| 879 | for (ic = 0; ic < in_channels / 8; ic++)
|
---|
| 880 | {
|
---|
| 881 | FC_UNROLL_S(1);
|
---|
| 882 | FC_UNROLL_S(2);
|
---|
| 883 | }
|
---|
[453] | 884 |
|
---|
[458] | 885 | dest[oc] = sum + bias[oc];
|
---|
| 886 | }
|
---|
| 887 | }
|
---|
| 888 | else
|
---|
| 889 | {
|
---|
| 890 | for (oc = 0; oc < out_channels; oc++)
|
---|
| 891 | {
|
---|
| 892 | const float *c_weights = weights + oc * in_channels;
|
---|
[453] | 893 |
|
---|
[458] | 894 | float sum = 0.0f;
|
---|
| 895 | for (ic = 0; ic < in_channels; ic++)
|
---|
| 896 | sum += src[ic] * c_weights[ic];
|
---|
| 897 | dest[oc] = sum + bias[oc];
|
---|
| 898 | }
|
---|
| 899 | }
|
---|
| 900 | free(weights);
|
---|
| 901 | free(bias);
|
---|
| 902 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
[453] | 903 | }
|
---|
| 904 |
|
---|
| 905 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 906 | {
|
---|
[458] | 907 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 908 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 909 | kpu_model_shape_t in_shape = arg->shape;
|
---|
| 910 | uint32_t oc, oy, ox;
|
---|
[453] | 911 |
|
---|
[458] | 912 | for (oy = 0; oy < in_shape.height; oy++)
|
---|
| 913 | for (ox = 0; ox < in_shape.width; ox++)
|
---|
| 914 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
| 915 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
[453] | 916 | }
|
---|
| 917 |
|
---|
| 918 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 919 | {
|
---|
[458] | 920 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 921 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 922 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 923 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 924 | uint32_t oc, oy, ox;
|
---|
[453] | 925 |
|
---|
[458] | 926 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 927 | float width_scale = (float)in_shape.width / out_width;
|
---|
[453] | 928 |
|
---|
[458] | 929 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
| 930 | {
|
---|
| 931 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 932 | for (oy = 0; oy < out_height; oy++)
|
---|
| 933 | {
|
---|
| 934 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 935 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 936 | for (ox = 0; ox < out_width; ox++)
|
---|
| 937 | {
|
---|
| 938 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 939 | *dest++ = y_origin[in_x];
|
---|
| 940 | }
|
---|
| 941 | }
|
---|
| 942 | }
|
---|
[453] | 943 | }
|
---|
| 944 |
|
---|
| 945 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 946 | {
|
---|
[458] | 947 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 948 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 949 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 950 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 951 | uint32_t oc, oy, ox;
|
---|
[453] | 952 |
|
---|
[458] | 953 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 954 | float width_scale = (float)in_shape.width / out_width;
|
---|
[453] | 955 |
|
---|
[458] | 956 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
| 957 | {
|
---|
| 958 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 959 | for (oy = 0; oy < out_height; oy++)
|
---|
| 960 | {
|
---|
| 961 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 962 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 963 | for (ox = 0; ox < out_width; ox++)
|
---|
| 964 | {
|
---|
| 965 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 966 | *dest++ = y_origin[in_x];
|
---|
| 967 | }
|
---|
| 968 | }
|
---|
| 969 | }
|
---|
[453] | 970 | }
|
---|
| 971 |
|
---|
| 972 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 973 | {
|
---|
[458] | 974 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 975 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 976 | size_t oc, channels = arg->channels;
|
---|
[453] | 977 |
|
---|
[458] | 978 | for (oc = 0; oc < channels; oc++)
|
---|
| 979 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
[453] | 980 | }
|
---|
| 981 |
|
---|
/*
 * Submit one K210 convolution layer to the KPU hardware.
 *
 * Copies the layer register block out of the model buffer, patches its
 * weight/batch-norm/activation pointers to physical (IOMEM-adjusted)
 * addresses, configures interrupts/DMA according to where the output
 * goes, and pushes the layer descriptor to the KPU FIFO.
 */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    /* Local working copy of the layer's register image; volatile reads
       pull it out of the model blob untouched by the optimizer. */
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    /* Rebase weight / BN / activation tables to bus addresses (minus IOMEM). */
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if (arg->flags & KLF_MAIN_MEM_OUT)
    {
        /* Output goes back to main memory: stream the KPU FIFO out via DMA
           and advance the model from the DMA-completion interrupt. */
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* Mask all KPU interrupts; completion is signaled by the DMA IRQ. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        /* Last layer finishes the model; otherwise continue stepping. */
        if (ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        /* NOTE(review): the element count (dma_total_byte + 8) / 8 looks like a
           round-up to 64-bit words but adds a full extra word when the byte
           count is already a multiple of 8 — hardware-specific, verify. */
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    }
    else
    {
        /* Output stays in KPU RAM: rely on the KPU calc-done interrupt. */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        /* Unmask only calc_done (0 = enabled). */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.interrupt_enabe.data.int_en = 1;
    }

    /* Queue the configured layer descriptor into the KPU. */
    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
|
---|
| 1026 |
|
---|
/*
 * Copy a 1x1-per-channel tensor from main memory into the KPU's padded
 * RAM layout (64-byte rows, 4 channels per row group, 16-byte channel
 * stride within a row). The constants below encode that fixed layout.
 */
static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
#if USE_CACHED_AI_RAM
    /* Cached alias of the AI RAM; requires an explicit flush afterwards. */
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
#else
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
#endif

    /* Fixed KPU memory geometry for 1x1 feature maps. */
    uint32_t row_padding = 16;
    uint32_t row_group = 4;
    uint32_t row_length = 1;
    uint32_t height = 4;
    uint32_t oc, x, y, channels = arg->channels;

    for (oc = 0; oc < channels; oc++)
    {
        /* Each group of 4 channels shares a 64-byte row; channels within
           a group are offset by 16 bytes. */
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        /* y and x loop exactly once: this path only handles 1x1 maps. */
        for (y = 0; y < 1; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for (x = 0; x < 1; x++)
                y_origin[x] = *src++;
        }
    }

#if USE_CACHED_AI_RAM
    /* Push the cached writes through to the KPU-visible RAM. */
    uint32_t lines = row_length * height * channels / row_group;
    kpu_flush_cache(arg->kpu_mem_out_address, lines);
#endif
}
|
---|
| 1058 |
|
---|
| 1059 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1060 | {
|
---|
[458] | 1061 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1062 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1063 | uint32_t oc, channels = arg->channels;
|
---|
[453] | 1064 |
|
---|
[458] | 1065 | for (oc = 0; oc < channels; oc++)
|
---|
| 1066 | *dest++ = src[oc * 16];
|
---|
[453] | 1067 | }
|
---|
| 1068 |
|
---|
| 1069 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1070 | {
|
---|
[458] | 1071 | size_t width = arg->width;
|
---|
| 1072 | size_t height = arg->height;
|
---|
| 1073 | size_t channels = arg->channels;
|
---|
[453] | 1074 |
|
---|
[458] | 1075 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
[453] | 1076 | }
|
---|
| 1077 |
|
---|
/*
 * Parse a kmodel v3 blob and initialize the inference context.
 *
 * Lays out pointers to the outputs table, layer headers, and layer
 * bodies inside the caller-owned model buffer, and allocates the
 * scratch main buffer used during inference.
 *
 * Returns 0 on success, -1 on unsupported model version/arch or on
 * allocation failure.
 */
int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
#if FIX_CACHE
    /* The model must live in the cached memory window. */
    configASSERT(is_memory_cache((uintptr_t)buffer));
#endif
    uintptr_t base_addr = (uintptr_t)buffer;
    const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;

    /* Only kmodel version 3 with arch 0 (K210) is supported. */
    if (header->version == 3 && header->arch == 0)
    {
        ctx->model_buffer = buffer;
        ctx->output_count = header->output_count;
        /* Sections are packed back-to-back: header, outputs, layer
           headers, then the concatenated layer bodies. */
        ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
        ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
        ctx->layers_length = header->layers_length;
        ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
        ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
        if (!ctx->main_buffer)
            return -1;
        /* Total size of all layer bodies, from the per-layer headers. */
        uint32_t body_size = 0;
        for (int i = 0; i < ctx->layers_length; i++)
        {
            const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
            body_size += cnt_layer_header->body_size;
        }
        /* NOTE(review): this copies the bodies onto their own uncached
           (IOMEM-offset) alias and verifies the copy byte-by-byte —
           presumably to force the data out of the cache so the KPU sees
           it; confirm against the platform's memory map. */
        uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
        const uint8_t *body_start_cache = ctx->body_start;
        memcpy(body_start_iomem, body_start_cache, body_size);
        for (int i = 0; i < body_size; i++)
        {
            configASSERT(body_start_iomem[i] == body_start_cache[i]);
        }
    }
    else
    {
        return -1;
    }

    return 0;
}
|
---|
| 1118 |
|
---|
| 1119 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
| 1120 | {
|
---|
[458] | 1121 | if (index >= ctx->output_count)
|
---|
| 1122 | return -1;
|
---|
[453] | 1123 |
|
---|
[458] | 1124 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
| 1125 | *data = ctx->main_buffer + output->address;
|
---|
| 1126 | *size = output->size;
|
---|
| 1127 | return 0;
|
---|
[453] | 1128 | }
|
---|
| 1129 |
|
---|
| 1130 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
| 1131 | {
|
---|
[458] | 1132 | free(ctx->main_buffer);
|
---|
| 1133 | ctx->main_buffer = NULL;
|
---|
[453] | 1134 | }
|
---|
| 1135 |
|
---|
#if KPU_DEBUG
/* Per-run profiling state, updated from ai_step()/kpu_kmodel_done(). */
static uint64_t last_time;       /* timestamp when the previous layer started (us) */
static uint64_t total_time;      /* accumulated wall time across all layers (us) */
static uint64_t kpu_time;        /* portion of total_time spent in K210Conv layers (us) */
static uint32_t last_layer_type; /* type of the layer timed by last_time */

/*
 * Map a kpu_model_layer_type value to a short human-readable name for
 * the profiling syslog output. Unknown types yield "Unknown".
 */
static const char *str_layer_type(uint32_t type)
{
    switch (type)
    {
    case KL_ADD:
        return "Add";
    case KL_QUANTIZED_ADD:
        return "QuantAdd";
    case KL_GLOBAL_AVERAGE_POOL2D:
        return "GAP";
    case KL_QUANTIZED_MAX_POOL2D:
        return "QuantMaxPool2d";
    case KL_AVERAGE_POOL2D:
        return "AveragePool2d";
    case KL_QUANTIZE:
        return "Quantize";
    case KL_DEQUANTIZE:
        return "Dequantize";
    case KL_REQUANTIZE:
        return "Requantize";
    case KL_L2_NORMALIZATION:
        return "L2Norm";
    case KL_SOFTMAX:
        return "Softmax";
    case KL_CONCAT:
        return "Concat";
    case KL_QUANTIZED_CONCAT:
        return "QuantConcat";
    case KL_FULLY_CONNECTED:
        return "FullyConnected";
    case KL_TENSORFLOW_FLATTEN:
        return "TFFlatten";
    case KL_RESIZE_NEAREST_NEIGHBOR:
        return "ResizeNearestNeighbor";
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
        return "QuantResizeNearestNeighbor";
    case KL_CHANNELWISE_DEQUANTIZE:
        return "ChannelwiseDequantize";
    case KL_LOGISTIC:
        return "Logistic";
    case KL_K210_CONV:
        return "K210Conv";
    case KL_K210_ADD_PADDING:
        return "K210AddPad";
    case KL_K210_REMOVE_PADDING:
        return "K210RemovePad";
    case KL_K210_UPLOAD:
        return "K210Upload";
    default:
        return "Unknown";
    }
}
#endif
|
---|
| 1195 |
|
---|
/*
 * Finish a model run: mask and clear all KPU interrupts, emit the
 * profiling summary (when KPU_DEBUG), and invoke the user's completion
 * callback. Always returns 0; the int return matches the PLIC/DMA
 * callback signatures it is registered under.
 */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge any pending KPU interrupts... */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* ...and mask them all (1 = masked) now that the run is over. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    /* Account for the final layer's elapsed time, then print totals.
       NOTE(review): the %d specifiers receive uint64_t expressions
       (layer_time / 1000 etc.) — whether this is safe depends on
       the TOPPERS t_syslog argument model; verify. */
    uint32_t cnt_layer_id = ctx->current_layer;
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
    syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
    syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
#endif
    /* Hand control back to the caller-supplied completion callback. */
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
| 1225 |
|
---|
/*
 * Execute the next layer of the model.
 *
 * Called initially from kpu_run_kmodel() (via ai_step_not_isr) and then
 * re-entered from KPU/DMA interrupt callbacks after each hardware
 * (K210Conv) layer completes. CPU-executed layers fall through the
 * switch and chain directly into the next layer; K210Conv returns
 * immediately because the hardware interrupt will call ai_step again.
 *
 * Returns 0 normally, -1 on overrun or an unsupported layer type.
 */
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer;
    const uint8_t *layer_body = ctx->current_body;
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    /* Defensive: if we are called past the last layer, finish the run. */
    if (cnt_layer_id >= ctx->layers_length)
    {
        //syslog(LOG_NOTICE, "overrun");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* Advance the cursor before dispatch so ISR re-entry sees the next layer. */
    ctx->current_layer++;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    /* Profile the layer that just finished (timed since last_time). */
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    /* Dispatch on the layer type; all cases except KL_K210_CONV run on
       the CPU and complete synchronously. */
    switch (cnt_layer_header->type)
    {
    case KL_ADD:
        kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_ADD:
        kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_GLOBAL_AVERAGE_POOL2D:
        kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_MAX_POOL2D:
        kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_AVERAGE_POOL2D:
        kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZE:
        kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_DEQUANTIZE:
        kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_REQUANTIZE:
        kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_L2_NORMALIZATION:
        kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
        break;
    case KL_SOFTMAX:
        kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CONCAT:
    case KL_QUANTIZED_CONCAT:
        /* Both concat variants are byte copies; one handler serves both. */
        kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
        break;
    case KL_FULLY_CONNECTED:
        kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
        break;
    case KL_TENSORFLOW_FLATTEN:
        kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
        break;
    case KL_RESIZE_NEAREST_NEIGHBOR:
        kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
        kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CHANNELWISE_DEQUANTIZE:
        kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
        break;
    case KL_LOGISTIC:
        kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_CONV:
        /* Hardware layer: completion interrupt re-enters ai_step / done. */
        kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
        return 0;
    case KL_K210_ADD_PADDING:
        kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_REMOVE_PADDING:
        kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_UPLOAD:
        kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
        break;
    default:
        assert(!"Layer is not supported.");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* CPU layer finished synchronously: recurse into the next layer, or
       finish the run. NOTE(review): the `< layers_length - 1` bound looks
       off by one relative to the guard at the top (which uses
       `>= layers_length`); the recursion's own guard makes it safe, but
       the asymmetry is worth confirming against upstream. */
    if (ctx->current_layer < (ctx->layers_length - 1))
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
|
---|
| 1336 |
|
---|
/*
 * Run ai_step() from task (non-interrupt) context.
 *
 * The AI DMA and KPU interrupts are disabled around the call so an
 * in-flight completion cannot re-enter ai_step() while the first CPU
 * layers are still being dispatched.
 */
static void ai_step_not_isr(void *userdata)
{
    dis_int(INTNO_DMAAI);
    dis_int(INTNO_AI);

    ai_step(userdata);

    ena_int(INTNO_DMAAI);
    ena_int(INTNO_AI);
}
|
---|
| 1347 |
|
---|
/*
 * Start an asynchronous model run.
 *
 * Resets the layer cursor, programs the KPU's FIFO/interrupt/8-bit-mode
 * registers from the model header, registers the AI interrupt, then
 * feeds the input tensor according to the first layer's type. Completion
 * is reported through done_callback(userdata) from kpu_kmodel_done().
 *
 * Returns 0 on success, -1 if the first layer type cannot accept input.
 */
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    /* Reset the profiling accumulators for this run. */
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    /* Clear all three interrupt bits at once. */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    /* Bit 0 of the model flags selects 8-bit weight mode. */
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    /* Unmask only layer_cfg_almost_empty (0 = enabled). */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    //plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, ai_step, ctx);
    plic_irq_enable(INTNO_AI);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;

    /* Input delivery depends on the first layer's kind. */
    switch (first_layer_header->type)
    {
    case KL_K210_CONV:
    {
        const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
        kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

        /* Rows not a multiple of 64 bytes need CPU-side padding;
           otherwise the input can be streamed straight in via DMA. */
        if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
        {
            kpu_kmodel_input_with_padding(&layer_arg, src);
            ai_step_not_isr(ctx);
        }
        else
        {
            kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
        }
    }
    break;
    case KL_FULLY_CONNECTED:
    {
        /* Float input copied directly into the main buffer. */
        const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
        kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
        ai_step_not_isr(ctx);
    }
    break;
    default:
        /* Models must start with a conv or fully-connected layer. */
        return -1;
    }

    return 0;
}
|
---|
[458] | 1409 |
|
---|
/*
 * Initialize the AI DMA channel used to drain the KPU output FIFO.
 *
 * Fills in the global g_ai_hdma descriptor (peripheral-to-memory,
 * hardware handshake on the source side) and hands it to dma_init().
 * Returns the kernel error code from dma_init().
 */
ER kpu_init(kpu_model_context_t *ctx)
{
    g_ai_hdma.chnum = AI_DMA_CH;
    g_ai_hdma.xfercallback = ai_dma_done_isr;
    g_ai_hdma.errorcallback = NULL;
    g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;      /* DMA request selection */
    g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;    /* DMA transfer direction */
    g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;  /* source multi-block type */
    g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;  /* destination multi-block type */
    g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;     /* source handshake */
    g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;     /* destination handshake */
    g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
    g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
    g_ai_hdma.Init.Priority = 4;                        /* channel priority */
    g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;            /* source master setting */
    g_ai_hdma.Init.DstMaster = DMAC_MASTER2;            /* destination master setting */
    g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;         /* source increment setting */
    g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;        /* destination increment setting */
    g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* source transfer width */
    g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* destination transfer width */
    g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;         /* source burst size */
    g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;         /* destination burst size */
    g_ai_hdma.Init.IocBlkTrans = 0;                     /* IOC block transfer */
    g_ai_hdma.localdata = (void *)ctx;

    return dma_init(&g_ai_hdma);
}
|
---|