[453] | 1 | #include <assert.h>
|
---|
| 2 | #include <float.h>
|
---|
| 3 | #include <math.h>
|
---|
| 4 | #include <stdio.h>
|
---|
| 5 | #include <stdlib.h>
|
---|
| 6 | #include <string.h>
|
---|
| 7 | #include <stdint.h>
|
---|
| 8 | #include <kernel.h>
|
---|
| 9 | #include <t_syslog.h>
|
---|
| 10 | #include <t_stdlib.h>
|
---|
| 11 | #include <kernel_impl.h>
|
---|
| 12 | #include <target_syssvc.h>
|
---|
| 13 | #include "kendryte-k210.h"
|
---|
| 14 | #include "device.h"
|
---|
| 15 | #include "atomic.h"
|
---|
| 16 | #include "kpu.h"
|
---|
| 17 | #include "utils.h"
|
---|
| 18 | #include "kpu_main.h"
|
---|
| 19 |
|
---|
/* Read-modify-write helper: OR the bits of b into the 32-bit word at address a. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
| 21 |
|
---|
/*
 * Enable external interrupts: sets the machine external-interrupt enable
 * bit in mie, then the global machine interrupt enable in mstatus.
 */
void sysctl_enable_irq(void)
{
    set_csr(mie, MIP_MEIP);
    set_csr(mstatus, MSTATUS_MIE);
}
|
---|
| 27 |
|
---|
/*
 * Disable external interrupts: clears the machine external-interrupt
 * enable bit in mie and the global machine interrupt enable in mstatus.
 * Counterpart of sysctl_enable_irq().
 */
void sysctl_disable_irq(void)
{
    clear_csr(mie, MIP_MEIP);
    clear_csr(mstatus, MSTATUS_MIE);
}
|
---|
| 33 |
|
---|
| 34 | uint64_t sysctl_get_time_us(void)
|
---|
| 35 | {
|
---|
| 36 | uint64_t v_cycle = read_cycle();
|
---|
| 37 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
| 38 | }
|
---|
| 39 |
|
---|
/*
 * Return nonzero when `address` lies in a DMA-reachable region:
 * the cached SRAM window at 0x8000_0000 (6 MiB), the uncached SRAM
 * alias at 0x4000_0000 (8 MiB), or the KPU FIFO output register
 * at 0x5045_0040.
 */
static int is_memory(uintptr_t address)
{
    enum
    {
        mem_len = 6 * 1024 * 1024,
        mem_no_cache_len = 8 * 1024 * 1024,
    };

    if(address >= 0x80000000 && address < 0x80000000 + mem_len)
        return 1;
    if(address >= 0x40000000 && address < 0x40000000 + mem_no_cache_len)
        return 1;
    return address == 0x50450040;
}
|
---|
| 49 |
|
---|
/*
 * Return nonzero when `address` falls inside the 6 MiB cached SRAM
 * window starting at 0x8000_0000.
 */
uint32_t is_memory_cache(uintptr_t address)
{
#define MEM_CACHE_LEN (6 * 1024 * 1024)

    const uintptr_t cache_base = 0x80000000;
    return (address >= cache_base) && (address < cache_base + MEM_CACHE_LEN);
}
|
---|
| 56 |
|
---|
| 57 | int plic_irq_enable(INTNO irq_number)
|
---|
| 58 | {
|
---|
| 59 | if (irq_number != INTNO_AI)
|
---|
| 60 | return -1;
|
---|
| 61 | ena_int(irq_number);
|
---|
| 62 | return 0;
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
| 66 | {
|
---|
| 67 | if (irq_number != INTNO_AI)
|
---|
| 68 | return -1;
|
---|
| 69 | set_ipriority(irq_number, priority);
|
---|
| 70 | return 0;
|
---|
| 71 | }
|
---|
| 72 |
|
---|
/* Callback and opaque context dispatched from ai_done_isr(); published
 * via plic_irq_register() under a CPU lock. */
plic_irq_callback_t ai_done_callback;
void *ai_done_ctx;
|
---|
| 75 |
|
---|
/*
 * Register the callback dispatched by ai_done_isr(). Only INTNO_AI is
 * supported; other interrupt numbers are silently ignored.
 * The callback/context pair is updated under a CPU lock so the ISR never
 * observes a half-updated pair.
 */
void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
{
    ER ret;
    if (irq != INTNO_AI)
        return;

    ret = loc_cpu();

    ai_done_callback = callback;
    ai_done_ctx = ctx;

    /* Only unlock when the lock was actually acquired. */
    if (ret == E_OK)
        unl_cpu();
}
|
---|
| 90 |
|
---|
/*
 * Interrupt service routine for the AI accelerator (INTNO_AI).
 * External interrupts are disabled while the registered callback runs,
 * then re-enabled. `exinf` is the kernel-supplied extended info (unused).
 */
void ai_done_isr(intptr_t exinf)
{
    sysctl_disable_irq();
    if (ai_done_callback != NULL){
        ai_done_callback(ai_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
| 99 |
|
---|
/* Callback and opaque context dispatched from ai_dma_done_isr();
 * published via kpu_dmac_irq_register() under a CPU lock. */
plic_irq_callback_t ai_dma_done_callback;
void *ai_dma_done_ctx;
|
---|
| 102 |
|
---|
/*
 * Register the callback dispatched by ai_dma_done_isr() for the AI DMA
 * channel. Only AI_DMA_CH is supported; other channels are silently
 * ignored. Sets the DMA interrupt priority, then publishes the
 * callback/context pair under a CPU lock.
 */
void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
        plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
{
    ER ret;
    if (channel_num != AI_DMA_CH)
        return;

    set_ipriority(INTNO_DMAAI, priority);

    ret = loc_cpu();

    ai_dma_done_callback = dmac_callback;
    ai_dma_done_ctx = ctx;

    /* Only unlock when the lock was actually acquired. */
    if (ret == E_OK)
        unl_cpu();
}
|
---|
| 120 |
|
---|
/*
 * Completion handler for the AI DMA channel. External interrupts are
 * disabled while the registered callback runs, then re-enabled.
 * `dma` is the completed DMA handle (unused; context comes from
 * ai_dma_done_ctx).
 */
void ai_dma_done_isr(DMA_Handle_t *dma)
{
    sysctl_disable_irq();
    if (ai_dma_done_callback != NULL) {
        ai_dma_done_callback(ai_dma_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
| 129 |
|
---|
| 130 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
| 131 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
| 132 | {
|
---|
| 133 | ER ret;
|
---|
| 134 | if (channel_num != AI_DMA_CH)
|
---|
| 135 | return;
|
---|
| 136 |
|
---|
| 137 | set_ipriority(INTNO_DMAAI, priority);
|
---|
| 138 |
|
---|
| 139 | ret = loc_cpu();
|
---|
| 140 |
|
---|
| 141 | ai_dma_done_callback = dmac_callback;
|
---|
| 142 | ai_dma_done_ctx = ctx;
|
---|
| 143 |
|
---|
| 144 | if (ret == E_OK)
|
---|
| 145 | unl_cpu();
|
---|
| 146 | }
|
---|
| 147 |
|
---|
/* DMA handle shared by all AI-channel transfers in this driver. */
DMA_Handle_t g_ai_hdma;
|
---|
| 149 |
|
---|
/*
 * Configure and start a single-block DMA transfer on the AI channel.
 * Only AI_DMA_CH is accepted; other channels are silently ignored.
 * The transfer direction and handshake modes are derived from whether
 * src/dest point into memory (see is_memory()). block_size is in units
 * of the configured transfer width, not bytes.
 */
void dmac_set_single_mode(dmac_channel_number_t channel_num,
        const void *src, void *dest, uint8_t src_inc,
        uint8_t dest_inc,
        uint8_t dmac_burst_size,
        uint8_t dmac_trans_width,
        size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    /* Pick the flow-control mode from the memory/peripheral nature of
     * the two endpoints. */
    uint8_t flow_control;
    if(mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if(mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if(mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                             /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
    hdma->Init.SrcInc = src_inc;                                                     /* source increment setting */
    hdma->Init.DstInc = dest_inc;                                                    /* destination increment setting */
    hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}
|
---|
| 185 |
|
---|
/* Number of layer argument sets pushed into the KPU FIFO per refill. */
#define LAYER_BURST_SIZE 12

#define KPU_DEBUG 0
#define USE_CACHED_AI_RAM 0

/* NOTE: min/max evaluate their arguments more than once; do not pass
 * expressions with side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
/* Round x up to the next multiple of align (align must be a power of two).
 * Fix: x and align are now fully parenthesized so expansions such as
 * ALIGN_UP(a | b, 8) group correctly. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
| 194 |
|
---|
/* Forward declarations for the kmodel execution path (definitions are
 * elsewhere in this file — not visible in this chunk). */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);
|
---|
| 197 |
|
---|
/* Memory-mapped KPU register block. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* Busy flag for the kpu_start()/kpu_done() path (0 = idle, 1 = running). */
static volatile uint32_t kpu_status;

/* Per-run state for the kpu_run() path. */
typedef struct kpu_context
{
    kpu_task_t kpu_task;   /* private copy of the caller's task */
    uint32_t kpu_status;   /* busy flag (0 = idle, 1 = running) */
} kpu_context_t;

volatile kpu_context_t g_kpu_context;
|
---|
| 208 |
|
---|
/* Final completion callback for the kpu_run() path: clears the busy flag
 * and hands the finished task to the user-supplied callback. */
static int kpu_run_all_done(void *_task)
{
    atomic_swap(&g_kpu_context.kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)_task;
    task->callback(task);
    return 0;
}
|
---|
| 216 |
|
---|
| 217 | int kpu_continue(void *_task)
|
---|
| 218 | {
|
---|
| 219 | kpu_task_t *task = (kpu_task_t *)_task;
|
---|
| 220 | int layer_burst_size = 1;
|
---|
| 221 |
|
---|
| 222 | kpu->interrupt_clear.data = (kpu_config_interrupt_t){
|
---|
| 223 | .calc_done_int = 1,
|
---|
| 224 | .layer_cfg_almost_empty_int = 1,
|
---|
| 225 | .layer_cfg_almost_full_int = 1};
|
---|
| 226 |
|
---|
| 227 | if(task->remain_layers_length == 0)
|
---|
| 228 | {
|
---|
| 229 | return 0;
|
---|
| 230 | }
|
---|
| 231 | if(task->remain_layers_length <= layer_burst_size)
|
---|
| 232 | {
|
---|
| 233 | for(uint32_t i = 0; i < task->remain_layers_length; i++)
|
---|
| 234 | {
|
---|
| 235 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 236 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 237 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 238 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 239 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 240 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 241 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 242 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 243 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 244 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 245 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 246 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 247 | }
|
---|
| 248 | task->remain_layers_length = 0;
|
---|
| 249 | } else
|
---|
| 250 | {
|
---|
| 251 | for(uint32_t i = 0; i < layer_burst_size; i++)
|
---|
| 252 | {
|
---|
| 253 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 254 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 255 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 256 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 257 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 258 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 259 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 260 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 261 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 262 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 263 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 264 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 265 | }
|
---|
| 266 | task->remain_layers += layer_burst_size;
|
---|
| 267 | task->remain_layers_length -= layer_burst_size;
|
---|
| 268 | }
|
---|
| 269 | return 0;
|
---|
| 270 | }
|
---|
| 271 |
|
---|
| 272 | static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
|
---|
| 273 | {
|
---|
| 274 | select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
|
---|
| 275 | kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
|
---|
| 276 | dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
|
---|
| 277 | DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
|
---|
| 278 | return 0;
|
---|
| 279 | }
|
---|
| 280 |
|
---|
/*
 * Input-DMA completion callback for kpu_run(): finishes the input
 * transfer, programs the KPU FIFO, arms the output DMA, then starts
 * pushing layer arguments via kpu_continue().
 */
static int kpu_run_dma_input_done_push_layers(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    /* Clear all three KPU interrupt bits. */
    kpu->interrupt_clear.reg = 7;
    dma_end(&g_ai_hdma);
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = task->eight_bit_mode};

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte appears to store (bytes - 1) — hence the +1;
     * same convention as kpu_run(). */
    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);

    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_continue(task);
    return 0;
}
|
---|
| 302 |
|
---|
/*
 * Start the input DMA for kpu_run(): copies the source image into the
 * KPU I/O RAM and registers `cb` to fire on completion with `_task`.
 * The input byte count is derived from the first layer's geometry;
 * channel_switch_addr is multiplied by 64, so it appears to be in
 * 64-byte units — TODO confirm against the KPU register spec.
 */
static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
{
    kpu_task_t *task = _task;
    kpu_layer_argument_t *first_layer = &task->layers[0];
    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
    kpu_dmac_irq_register(dma_ch, cb, _task, 1);
    /* 64-bit transfers, hence input_size / 8 beats. */
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
| 312 |
|
---|
/*
 * Run a complete task on the KPU (kpu_run() state machine): copies the
 * task into g_kpu_context, marks the final layer to stream its output,
 * sets up the AI interrupt to refill the layer FIFO, and kicks off the
 * input DMA. The rest happens in interrupt context; `callback` fires
 * when the output DMA completes.
 * Returns -1 if a run is already in progress, 0 otherwise.
 */
int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
{
    /* Claim the KPU; fail if another kpu_run() is still active. */
    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
        return -1;

    /* Work on a private copy so the caller's task stays untouched. */
    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte appears to store (bytes - 1) — hence the +1. */
    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;

    /* Make the final layer raise an interrupt and stream its output. */
    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;

    task->dma_ch = dma_ch;
    task->dst = dest;
    task->dst_length = output_size;
    task->callback = callback;
    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;

    /* Each AI interrupt refills the layer FIFO via kpu_continue(). */
    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, kpu_continue, task);
    plic_irq_enable(INTNO_AI);

    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);

    return 0;
}
|
---|
| 343 |
|
---|
/*
 * Allocate an output buffer large enough for the task's final layer,
 * rounded up to a multiple of 8 bytes (the 64-bit DMA transfer width).
 * The caller owns the buffer and releases it with kpu_release_output_buf().
 * Returns NULL when allocation fails.
 */
uint8_t *kpu_get_output_buf(kpu_task_t *task)
{
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
    return malloc(output_size);
}
|
---|
| 350 |
|
---|
/*
 * Release a buffer obtained from kpu_get_output_buf().
 * free(NULL) is a no-op per the C standard, so the previous explicit
 * NULL guard was redundant and has been removed; passing NULL is safe.
 */
void kpu_release_output_buf(uint8_t *output_buf)
{
    free(output_buf);
}
|
---|
| 356 |
|
---|
/* Output-DMA completion handler for the kpu_start() path: clears the
 * busy flag and invokes the user callback with the user context. */
static int kpu_done(void *ctx)
{
    atomic_swap(&kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)ctx;
    task->callback(task->ctx);
    return 0;
}
|
---|
| 364 |
|
---|
| 365 | static int kpu_config_input(void *ctx)
|
---|
| 366 | {
|
---|
| 367 | kpu_task_t *task = (kpu_task_t *)ctx;
|
---|
| 368 | kpu->interrupt_clear.reg = 7;
|
---|
| 369 | if(task->remain_layers_length <= LAYER_BURST_SIZE)
|
---|
| 370 | {
|
---|
| 371 | for(uint32_t i = 0; i < task->remain_layers_length; i++)
|
---|
| 372 | {
|
---|
| 373 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 374 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 375 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 376 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 377 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 378 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 379 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 380 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 381 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 382 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 383 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 384 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 385 | }
|
---|
| 386 | task->remain_layers_length = 0;
|
---|
| 387 | kpu->interrupt_mask.reg = 7;
|
---|
| 388 | } else
|
---|
| 389 | {
|
---|
| 390 | for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
|
---|
| 391 | {
|
---|
| 392 | kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
|
---|
| 393 | kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
|
---|
| 394 | kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
|
---|
| 395 | kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
|
---|
| 396 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
|
---|
| 397 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
|
---|
| 398 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
|
---|
| 399 | kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
|
---|
| 400 | kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
|
---|
| 401 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
|
---|
| 402 | kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
|
---|
| 403 | kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
|
---|
| 404 | }
|
---|
| 405 | task->remain_layers += LAYER_BURST_SIZE;
|
---|
| 406 | task->remain_layers_length -= LAYER_BURST_SIZE;
|
---|
| 407 | }
|
---|
| 408 | return 0;
|
---|
| 409 | }
|
---|
| 410 |
|
---|
/*
 * Arm the output DMA for the kpu_start() path: route AI RX requests to
 * the task's channel and stream the KPU FIFO into task->dst
 * (task->dst_length 64-bit beats). kpu_done() fires on completion.
 */
static void kpu_data_output(kpu_task_t *task)
{
    select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
    kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
}
|
---|
| 418 |
|
---|
/*
 * Input-DMA completion handler for kpu_start(): finishes the input
 * transfer, arms the output DMA, programs the KPU FIFO and interrupt
 * registers, and pushes the first burst of layer arguments.
 */
static int kpu_data_ready(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;

    dma_end(&g_ai_hdma);
    kpu_data_output(task);

    kpu->eight_bit_mode.reg = task->eight_bit_mode;
    /* Touch all three KPU interrupt bits while (re)configuring. */
    kpu->interrupt_mask.reg = 7;
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 12, .fifo_empty_threshold = 1};

    /* Subsequent AI interrupts refill the layer FIFO. */
    plic_set_priority(INTNO_AI, 2);
    plic_irq_register(INTNO_AI, kpu_config_input, task);
    plic_irq_enable(INTNO_AI);
    kpu_config_input(task);
    /* NOTE(review): interrupt mask bit polarity is not visible here;
     * this mirrors the settings used by kpu_init(). */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    return 0;
}
|
---|
| 442 |
|
---|
/*
 * Start the input phase of kpu_start(). When task->src is NULL the
 * input data is assumed to already be in KPU I/O RAM and the DMA is
 * skipped; otherwise the source image is copied into the first layer's
 * image area and kpu_data_ready() fires on completion.
 */
static void kpu_data_input(kpu_task_t *task)
{
    if(task->src == NULL)
    {
        kpu_data_ready(task);
        return;
    }
    kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
    kpu_layer_argument_t *layer = &task->layers[0];
    /* image_src_addr is multiplied by 64, so it appears to address
     * 64-byte units inside the KPU I/O RAM — TODO confirm. */
    dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
}
|
---|
| 455 |
|
---|
/*
 * Prepare a task for kpu_start(): enables the KPU clock, flags the last
 * layer to stream and signal its results, computes the input/output DMA
 * sizes (in 8-byte beats, not bytes), and allocates a zeroed output
 * buffer in task->dst (released by kpu_single_task_deinit()).
 * Returns 0 on success, 1 if the output buffer cannot be allocated.
 */
int kpu_single_task_init(kpu_task_t *task)
{
    /*
     * Enable the AI accelerator clock.
     */
    sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);

    kpu_layer_argument_t *first_layer = &task->layers[0];
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;
    /* src_length/dst_length are in 8-byte DMA beats. */
    task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
    task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
    task->dst = (uint64_t *)malloc(task->dst_length * 8);
    if(task->dst == NULL)
        return 1;
    memset(task->dst, 0, task->dst_length * 8);
    return 0;
}
|
---|
| 476 |
|
---|
| 477 | int kpu_single_task_deinit(kpu_task_t *task)
|
---|
| 478 | {
|
---|
| 479 | free(task->dst);
|
---|
| 480 | return 0;
|
---|
| 481 | }
|
---|
| 482 |
|
---|
| 483 | int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
|
---|
| 484 | {
|
---|
| 485 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
| 486 | kpu_model_header_t *header = (kpu_model_header_t *)buffer;
|
---|
| 487 | kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
|
---|
| 488 | kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
|
---|
| 489 |
|
---|
| 490 | if(header->version != 1)
|
---|
| 491 | return -1;
|
---|
| 492 | uint32_t layers_length = header->layers_length;
|
---|
| 493 | task->layers_length = layers_length;
|
---|
| 494 | task->eight_bit_mode = header->flags & 1;
|
---|
| 495 | task->layers = layers;
|
---|
| 496 | task->output_scale = layer_meta[layers_length - 1].output_scale;
|
---|
| 497 | task->output_bias = layer_meta[layers_length - 1].output_bias;
|
---|
| 498 | size_t i;
|
---|
| 499 | for(i = 0; i < layers_length; i++)
|
---|
| 500 | {
|
---|
| 501 | layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
|
---|
| 502 | layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
|
---|
| 503 | layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
|
---|
| 504 | }
|
---|
| 505 |
|
---|
| 506 | if(meta)
|
---|
| 507 | *meta = layer_meta;
|
---|
| 508 | return 0;
|
---|
| 509 | }
|
---|
| 510 |
|
---|
/*
 * Start execution of a task prepared by kpu_single_task_init().
 * Completion is reported through task->callback (see kpu_done()).
 * Returns -1 if the KPU is already busy, 0 when the run was started.
 */
int kpu_start(kpu_task_t *task)
{
    /* Claim the KPU; fail if a previous run has not completed. */
    if(atomic_cas(&kpu_status, 0, 1))
        return -1;

    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;
    kpu_data_input(task);
    return 0;
}
|
---|
| 521 |
|
---|
/*
 * Write one layer's 12 argument words into the KPU layer-argument FIFO.
 * The same fixed write order is used everywhere in this file; do not
 * reorder these stores.
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
| 537 |
|
---|
/*
 * One-time KPU setup: clears pending interrupts, programs the FIFO
 * thresholds and the 8-bit mode flag, configures the interrupt mask,
 * and registers `callback` (with `userdata`) as the AI interrupt
 * handler at priority 1.
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    /* Drop any stale interrupt state (all three bits). */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = eight_bit_mode};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, callback, userdata);
    plic_irq_enable(INTNO_AI);
}
|
---|
| 554 |
|
---|
/*
 * DMA an input image into the KPU I/O RAM for `layer`, invoking
 * `callback` (with `userdata`) when the transfer completes. The byte
 * count is derived from the layer geometry the same way as in
 * kpu_run_dma_input(); transfers are 64-bit, hence input_size / 8 beats.
 */
void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
| 562 |
|
---|
/* Queue a single convolution layer into the KPU argument FIFO. */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
| 567 |
|
---|
/*
 * Run a single convolution layer with its result kept in KPU memory
 * (no output DMA is armed). Pending interrupts are cleared and the
 * interrupt mask configured before the layer is queued.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
| 580 |
|
---|
/*
 * Run a single convolution layer and DMA its output to `dest`;
 * `callback` fires when the output transfer completes.
 * The transfer length rounds (dma_total_byte + 1) bytes up to whole
 * 8-byte beats: (x + 8) / 8 == ((x + 1) + 7) / 8.
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* Make the layer stream its result to the FIFO, then arm the DMA
     * before queueing the layer. */
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
    kpu_conv2d_core(layer);
}
|
---|
| 598 |
|
---|
/*
 * Run a convolution layer in "full add" mode: one 64-bit accumulated
 * value per output channel (o_ch_num + 1 channels) is streamed to
 * `dest`. `callback` fires when the output DMA completes.
 */
void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
{
    /* o_ch_num stores (channels - 1). */
    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
    layer->interrupt_enabe.data.full_add = 1;

    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    /* One 64-bit beat per channel. */
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
    kpu_conv2d_core(layer);
}
|
---|
| 619 |
|
---|
| 620 | void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
|
---|
| 621 | {
|
---|
| 622 | quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
|
---|
| 623 |
|
---|
| 624 | size_t i;
|
---|
| 625 | for(i = 0; i < count; i++)
|
---|
| 626 | {
|
---|
| 627 | int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
|
---|
| 628 | if(value < 0)
|
---|
| 629 | value = 0;
|
---|
| 630 | if(value > 0xFF)
|
---|
| 631 | value = 0xFF;
|
---|
| 632 | *dest++ = value;
|
---|
| 633 | }
|
---|
| 634 | }
|
---|
| 635 |
|
---|
/*
 * Global average pooling: averages `kernel_size` consecutive source
 * bytes per channel and requantizes from src_param to dest_param.
 * If `dest` points into the KPU I/O RAM window, each channel's single
 * output byte is scattered using the KPU's padded row layout (the
 * row_padding/row_group/row_length/height constants below are fixed
 * for a 1x1 output); otherwise dest is a plain channels-long array.
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU I/O RAM layout parameters for a 1x1 feature map. */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for(oc = 0; oc < channels; oc++)
        {
            /* Base address of this channel inside the padded layout. */
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            /* The output is 1x1, so both loops run exactly once. */
            for(y = 0; y < 1; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for(i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize, average, requantize, clamp to u8. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if(value < 0)
                        value = 0;
                    if(value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    } else
    {
        for(oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for(i = 0; i < kernel_size; i++)
                sum += *src++;

            /* Dequantize, average, requantize, clamp to u8. */
            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if(value < 0)
                value = 0;
            if(value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
| 688 |
|
---|
| 689 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
| 690 | {
|
---|
| 691 | quantize_param_t q = *src_param;
|
---|
| 692 | size_t oc;
|
---|
| 693 |
|
---|
| 694 | for(oc = 0; oc < channels; oc++)
|
---|
| 695 | {
|
---|
| 696 | int64_t sum = 0;
|
---|
| 697 | size_t i;
|
---|
| 698 | for(i = 0; i < kernel_size; i++)
|
---|
| 699 | sum += *src++;
|
---|
| 700 |
|
---|
| 701 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
| 702 | dest[oc] = value;
|
---|
| 703 | }
|
---|
| 704 | }
|
---|
| 705 |
|
---|
| 706 | void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
|
---|
| 707 | {
|
---|
| 708 | quantize_param_t q1 = *dest_param;
|
---|
| 709 | size_t i = 0;
|
---|
| 710 | for(i = 0; i < channels; i++)
|
---|
| 711 | *dest++ = src[i * 16] * q1.scale + q1.bias;
|
---|
| 712 | }
|
---|
| 713 |
|
---|
/* Software fully-connected layer:
 *   dest[oc] = dot(src, weights[oc * input_channels ..]) + biases[oc]
 * `weights` is row-major, one row of `input_channels` floats per output. */
void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
{
    int out_idx, in_idx;

    for(out_idx = 0; out_idx < output_channels; out_idx++)
    {
        const float *row = &weights[out_idx * input_channels];
        float acc = 0.0f;

        for(in_idx = 0; in_idx < input_channels; in_idx++)
            acc += src[in_idx] * row[in_idx];

        /* Bias is added after the full dot product, matching the
         * accumulation order of the original implementation. */
        dest[out_idx] = acc + biases[out_idx];
    }
}
|
---|
| 727 |
|
---|
| 728 | void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
|
---|
| 729 | {
|
---|
| 730 | quantize_param_t q1 = *src_param;
|
---|
| 731 | size_t i = 0;
|
---|
| 732 | for(i = 0; i < count; i++)
|
---|
| 733 | *dest++ = src[i] * q1.scale + q1.bias;
|
---|
| 734 | }
|
---|
| 735 |
|
---|
| 736 | void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
|
---|
| 737 | {
|
---|
| 738 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
|
---|
| 739 | size_t oc, y, x;
|
---|
| 740 |
|
---|
| 741 | uint32_t row_padding;
|
---|
| 742 | uint32_t row_group;
|
---|
| 743 | uint32_t row_length;
|
---|
| 744 |
|
---|
| 745 | if(width <= 16)
|
---|
| 746 | {
|
---|
| 747 | row_padding = 16;
|
---|
| 748 | row_group = 4;
|
---|
| 749 | row_length = 1;
|
---|
| 750 | } else if(width <= 32)
|
---|
| 751 | {
|
---|
| 752 | row_padding = 32;
|
---|
| 753 | row_group = 2;
|
---|
| 754 | row_length = 1;
|
---|
| 755 | } else
|
---|
| 756 | {
|
---|
| 757 | row_padding = 64;
|
---|
| 758 | row_group = 1;
|
---|
| 759 | row_length = (width + 63) / 64;
|
---|
| 760 | }
|
---|
| 761 |
|
---|
| 762 | for(oc = 0; oc < channels; oc++)
|
---|
| 763 | {
|
---|
| 764 | uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
---|
| 765 | for(y = 0; y < height; y++)
|
---|
| 766 | {
|
---|
| 767 | uint8_t *y_origin = channel_origin + y * row_length * 64;
|
---|
| 768 | for(x = 0; x < width; x++)
|
---|
| 769 | y_origin[x] = *src++;
|
---|
| 770 | }
|
---|
| 771 | }
|
---|
| 772 | }
|
---|
| 773 | #if USE_CACHED_AI_RAM
|
---|
| 774 | static void kpu_flush_cache(uint32_t addr, size_t lines)
|
---|
| 775 | {
|
---|
| 776 | size_t line;
|
---|
| 777 | for(line = 0; line < lines; line++)
|
---|
| 778 | {
|
---|
| 779 | const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
|
---|
| 780 | uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
|
---|
| 781 | size_t i;
|
---|
| 782 | for(i = 0; i < 8; i++)
|
---|
| 783 | dest[i] = src[i];
|
---|
| 784 | }
|
---|
| 785 | }
|
---|
| 786 | #endif
|
---|
/* Arithmetic right shift by `shift` with a carry-style adjustment based
 * on the last bit shifted out: when that bit is 1, positive values are
 * bumped up and negative values bumped down by one after the final
 * shift.  shift == 0 returns the value unchanged.
 * NOTE(review): presumably mirrors the KPU hardware's carry-shift
 * rounding — confirm against the K210 datasheet before changing. */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    int64_t v = value;

    if(shift == 0)
        return v;

    /* Shift all but the last position, then inspect the bit that the
     * final single-bit shift would discard. */
    v >>= shift - 1;
    if((v & 0x1) == 0)
        return v >> 1;

    return (v < 0) ? (v >> 1) - 1 : (v >> 1) + 1;
}
|
---|
/* Upload a CHW uint8 image from `src` into KPU I/O RAM at `kpu_addr`
 * (a 64-byte-unit address), applying the KPU input layout: rows padded
 * to 16/32/64 bytes by width bucket, with several channels sharing one
 * 64-byte row group for narrow images (same scheme as
 * kpu_input_with_padding).  Takes a 64-bit fast path when `src` is
 * 8-byte aligned and the width is a multiple of 8; otherwise falls back
 * to a byte-by-byte copy. */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Select the row layout bucket from the image width. */
    if(width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    } else if(width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    } else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }

    if((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* Fast path: the BEGIN macro opens the channel/row loops and computes
 * the 64-bit row origin; the caller supplies the per-row copy body, and
 * END closes the loops.  The macros must splice exactly around the body. */
#define UPLOAD_BEGIN() \
    for(oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for(y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
    } \
    }

        /* From here on, `width` counts 64-bit words, not bytes. */
        width /= 8;
        const uint64_t *u64_src = (const uint64_t *)src;
        /* Specialized copies for the common row sizes (8/16/32 bytes),
         * generic word loop otherwise. */
        if(width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        } else if(width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        } else if(width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        } else
        {
            UPLOAD_BEGIN()
            for(x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    } else
    {
        /* Slow path: unaligned source or odd width, copy byte by byte. */
        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
|
---|
| 890 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
| 891 | {
|
---|
| 892 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
| 893 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
| 894 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
| 895 |
|
---|
| 896 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
| 897 | }
|
---|
| 898 |
|
---|
/* Feed a float input tensor to the model: plain copy of `count` floats
 * into the main buffer (regions must not overlap). */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, sizeof(*dest) * count);
}
|
---|
| 903 |
|
---|
| 904 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
| 905 | {
|
---|
| 906 | size_t i;
|
---|
| 907 |
|
---|
| 908 | if(act == KLA_RELU)
|
---|
| 909 | {
|
---|
| 910 | for(i = 0; i < count; i++)
|
---|
| 911 | data[i] = max(data[i], 0);
|
---|
| 912 | } else if(act == KLA_RELU6)
|
---|
| 913 | {
|
---|
| 914 | for(i = 0; i < count; i++)
|
---|
| 915 | data[i] = min(max(data[i], 0), 6);
|
---|
| 916 | }
|
---|
| 917 | }
|
---|
| 918 |
|
---|
| 919 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 920 | {
|
---|
| 921 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
| 922 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
| 923 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 924 | size_t i, count = arg->count;
|
---|
| 925 |
|
---|
| 926 | for(i = 0; i < count; i++)
|
---|
| 927 | dest[i] = src_a[i] + src_b[i];
|
---|
| 928 | }
|
---|
| 929 |
|
---|
/* Add two uint8-quantized tensors elementwise and requantize the result.
 *
 * Each input byte is mapped to (x + offset) * multiplier, the operands
 * are summed, rescaled by mul_o, rounded via kpu_carry_shift, offset by
 * off_o, and clamped to [0, 255].  The loop is manually unrolled 8 wide;
 * `count` is the number of 8-element groups (arg->count rounded up), so
 * the buffers are presumably padded to a multiple of 8 bytes — TODO
 * confirm against the allocator.
 *
 * Two variants exist: when both inputs use the same shift (sh_a == sh_b)
 * the shift is applied once to the sum, preserving extra precision;
 * otherwise each operand is shifted individually before the addition. */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if(sh_a == sh_b)
    {
/* Shared-shift variant.  Stages: 1 load, 2 add input offsets,
 * 3 apply input multipliers, 4 sum, 5 single shared shift of the sum,
 * 6 output multiplier, 7 rounding shift, 8 output offset, 9 clamp,
 * 10 store. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#define QADD_UNROLL_8(x) \
    v##x += off_o;

#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Apply one stage to all 8 lanes of the unrolled group. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    } else
    {
/* Independent-shift variant: stage 4 shifts each operand separately and
 * stage 5 sums them; later stages match the shared-shift variant. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
|
---|
| 1072 |
|
---|
| 1073 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1074 | {
|
---|
| 1075 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1076 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1077 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
| 1078 |
|
---|
| 1079 | for(oc = 0; oc < channels; oc++)
|
---|
| 1080 | {
|
---|
| 1081 | float sum = 0.f;
|
---|
| 1082 | size_t i;
|
---|
| 1083 | for(i = 0; i < kernel_size; i++)
|
---|
| 1084 | sum += *src++;
|
---|
| 1085 |
|
---|
| 1086 | dest[oc] = sum / kernel_size;
|
---|
| 1087 | }
|
---|
| 1088 | }
|
---|
| 1089 |
|
---|
/* Max-pooling over a uint8 CHW tensor with stride and zero padding.
 * Window bounds are clipped to the input plane; because `value` starts
 * at 0, padded positions behave as implicit zeros. */
static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for(oc = 0; oc < out_shape.channels; oc++)
    {
        /* Input plane for this channel (CHW layout). */
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the pooling window in input coordinates;
                 * may be negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                /* Starting at 0 makes padding contribute implicit zeros. */
                uint8_t value = 0;

                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }

                *dest++ = value;
            }
        }
    }
}
|
---|
| 1132 |
|
---|
/* Average pooling over a float CHW tensor with stride and padding.
 * Only in-bounds samples are averaged: padded positions are excluded
 * from both the sum and the divisor (count-includes-valid-only style). */
static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for(oc = 0; oc < out_shape.channels; oc++)
    {
        /* Input plane for this channel (CHW layout). */
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the pooling window in input coordinates;
                 * may be negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel range to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                /* Number of valid (in-bounds) samples in this window. */
                float kernel_count = 0;

                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }

                *dest++ = value / kernel_count;
            }
        }
    }
}
|
---|
| 1177 |
|
---|
| 1178 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1179 | {
|
---|
| 1180 | size_t count = arg->count;
|
---|
| 1181 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1182 |
|
---|
| 1183 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
| 1184 |
|
---|
| 1185 | float scale = 1.f / q.scale;
|
---|
| 1186 |
|
---|
| 1187 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
| 1188 | size_t i;
|
---|
| 1189 | for(i = 0; i < count; i++)
|
---|
| 1190 | {
|
---|
| 1191 | int value = roundf((*src++ - q.bias) * scale);
|
---|
| 1192 | if(value < 0)
|
---|
| 1193 | value = 0;
|
---|
| 1194 | if(value > 0xFF)
|
---|
| 1195 | value = 0xFF;
|
---|
| 1196 | *dest++ = (uint8_t)value;
|
---|
| 1197 | }
|
---|
| 1198 | }
|
---|
| 1199 |
|
---|
| 1200 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1201 | {
|
---|
| 1202 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1203 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1204 | size_t oc, count = arg->count;
|
---|
| 1205 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
| 1206 |
|
---|
| 1207 | for(oc = 0; oc < count; oc++)
|
---|
| 1208 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
| 1209 | }
|
---|
| 1210 |
|
---|
| 1211 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1212 | {
|
---|
| 1213 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1214 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1215 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
| 1216 |
|
---|
| 1217 | for(oc = 0; oc < channels; oc++)
|
---|
| 1218 | {
|
---|
| 1219 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
| 1220 |
|
---|
| 1221 | for(i = 0; i < count; i++)
|
---|
| 1222 | *dest++ = *src++ * q.scale + q.bias;
|
---|
| 1223 | }
|
---|
| 1224 | }
|
---|
| 1225 |
|
---|
| 1226 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1227 | {
|
---|
| 1228 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1229 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1230 | size_t oc, count = arg->count;
|
---|
| 1231 | const uint8_t *table = arg->table;
|
---|
| 1232 |
|
---|
| 1233 | if(false && count % 8 == 0)
|
---|
| 1234 | {
|
---|
| 1235 | for(oc = 0; oc < count;)
|
---|
| 1236 | {
|
---|
| 1237 | dest[oc++] = table[*src++];
|
---|
| 1238 | dest[oc++] = table[*src++];
|
---|
| 1239 | dest[oc++] = table[*src++];
|
---|
| 1240 | dest[oc++] = table[*src++];
|
---|
| 1241 | dest[oc++] = table[*src++];
|
---|
| 1242 | dest[oc++] = table[*src++];
|
---|
| 1243 | dest[oc++] = table[*src++];
|
---|
| 1244 | dest[oc++] = table[*src++];
|
---|
| 1245 | }
|
---|
| 1246 | } else
|
---|
| 1247 | {
|
---|
| 1248 | for(oc = 0; oc < count; oc++)
|
---|
| 1249 | dest[oc] = table[src[oc]];
|
---|
| 1250 | }
|
---|
| 1251 | }
|
---|
| 1252 |
|
---|
| 1253 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1254 | {
|
---|
| 1255 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1256 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1257 | size_t oc, channels = arg->channels;
|
---|
| 1258 |
|
---|
| 1259 | float sum = 0.f;
|
---|
| 1260 | const float epsilon = 1e-10f;
|
---|
| 1261 | for(oc = 0; oc < channels; oc++)
|
---|
| 1262 | sum += src[oc] * src[oc];
|
---|
| 1263 | if(sum < epsilon)
|
---|
| 1264 | sum = epsilon;
|
---|
| 1265 | sum = 1.f / sqrtf(sum);
|
---|
| 1266 | for(oc = 0; oc < channels; oc++)
|
---|
| 1267 | dest[oc] = src[oc] * sum;
|
---|
| 1268 | }
|
---|
| 1269 |
|
---|
| 1270 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1271 | {
|
---|
| 1272 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1273 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1274 | size_t oc, channels = arg->channels;
|
---|
| 1275 |
|
---|
| 1276 | float max = FLT_MIN;
|
---|
| 1277 | for(oc = 0; oc < channels; oc++)
|
---|
| 1278 | max = fmaxf(max, src[oc]);
|
---|
| 1279 |
|
---|
| 1280 | float sum = 0.f;
|
---|
| 1281 | for(oc = 0; oc < channels; oc++)
|
---|
| 1282 | {
|
---|
| 1283 | float value = expf(src[oc] - max);
|
---|
| 1284 | sum += value;
|
---|
| 1285 | dest[oc] = value;
|
---|
| 1286 | }
|
---|
| 1287 |
|
---|
| 1288 | for(oc = 0; oc < channels; oc++)
|
---|
| 1289 | dest[oc] /= sum;
|
---|
| 1290 | }
|
---|
| 1291 |
|
---|
| 1292 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1293 | {
|
---|
| 1294 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1295 | uint32_t count = arg->input_count, i;
|
---|
| 1296 |
|
---|
| 1297 | for(i = 0; i < count; i++)
|
---|
| 1298 | {
|
---|
| 1299 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
| 1300 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
| 1301 | memcpy(dest, src, input.size);
|
---|
| 1302 | dest += input.size;
|
---|
| 1303 | }
|
---|
| 1304 | }
|
---|
| 1305 |
|
---|
| 1306 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1307 | {
|
---|
| 1308 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1309 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1310 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
| 1311 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
| 1312 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
| 1313 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
| 1314 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
| 1315 |
|
---|
| 1316 | if(in_channels % 8 == 0)
|
---|
| 1317 | {
|
---|
| 1318 | #define FC_UNROLL_1(x) \
|
---|
| 1319 | float i##x = *c_src++; \
|
---|
| 1320 | float w##x = *c_weights++;
|
---|
| 1321 |
|
---|
| 1322 | #define FC_UNROLL_2(x) \
|
---|
| 1323 | sum += i##x * w##x;
|
---|
| 1324 |
|
---|
| 1325 | #define FC_UNROLL_S(x) \
|
---|
| 1326 | FC_UNROLL_##x(0) \
|
---|
| 1327 | FC_UNROLL_##x(1) \
|
---|
| 1328 | FC_UNROLL_##x(2) \
|
---|
| 1329 | FC_UNROLL_##x(3) \
|
---|
| 1330 | FC_UNROLL_##x(4) \
|
---|
| 1331 | FC_UNROLL_##x(5) \
|
---|
| 1332 | FC_UNROLL_##x(6) \
|
---|
| 1333 | FC_UNROLL_##x(7)
|
---|
| 1334 |
|
---|
| 1335 | for(oc = 0; oc < out_channels; oc++)
|
---|
| 1336 | {
|
---|
| 1337 | const float *c_src = src;
|
---|
| 1338 | const float *c_weights = weights + oc * in_channels;
|
---|
| 1339 |
|
---|
| 1340 | float sum = 0.0f;
|
---|
| 1341 | for(ic = 0; ic < in_channels / 8; ic++)
|
---|
| 1342 | {
|
---|
| 1343 | FC_UNROLL_S(1);
|
---|
| 1344 | FC_UNROLL_S(2);
|
---|
| 1345 | }
|
---|
| 1346 |
|
---|
| 1347 | dest[oc] = sum + bias[oc];
|
---|
| 1348 | }
|
---|
| 1349 | } else
|
---|
| 1350 | {
|
---|
| 1351 | for(oc = 0; oc < out_channels; oc++)
|
---|
| 1352 | {
|
---|
| 1353 | const float *c_weights = weights + oc * in_channels;
|
---|
| 1354 |
|
---|
| 1355 | float sum = 0.0f;
|
---|
| 1356 | for(ic = 0; ic < in_channels; ic++)
|
---|
| 1357 | sum += src[ic] * c_weights[ic];
|
---|
| 1358 | dest[oc] = sum + bias[oc];
|
---|
| 1359 | }
|
---|
| 1360 | }
|
---|
| 1361 | free(weights);
|
---|
| 1362 | free(bias);
|
---|
| 1363 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
| 1364 | }
|
---|
| 1365 |
|
---|
| 1366 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1367 | {
|
---|
| 1368 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1369 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1370 | kpu_model_shape_t in_shape = arg->shape;
|
---|
| 1371 | uint32_t oc, oy, ox;
|
---|
| 1372 |
|
---|
| 1373 | for(oy = 0; oy < in_shape.height; oy++)
|
---|
| 1374 | for(ox = 0; ox < in_shape.width; ox++)
|
---|
| 1375 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1376 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
| 1377 | }
|
---|
| 1378 |
|
---|
| 1379 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1380 | {
|
---|
| 1381 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1382 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1383 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 1384 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 1385 | uint32_t oc, oy, ox;
|
---|
| 1386 |
|
---|
| 1387 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 1388 | float width_scale = (float)in_shape.width / out_width;
|
---|
| 1389 |
|
---|
| 1390 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1391 | {
|
---|
| 1392 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 1393 | for(oy = 0; oy < out_height; oy++)
|
---|
| 1394 | {
|
---|
| 1395 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 1396 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 1397 | for(ox = 0; ox < out_width; ox++)
|
---|
| 1398 | {
|
---|
| 1399 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 1400 | *dest++ = y_origin[in_x];
|
---|
| 1401 | }
|
---|
| 1402 | }
|
---|
| 1403 | }
|
---|
| 1404 | }
|
---|
| 1405 |
|
---|
| 1406 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1407 | {
|
---|
| 1408 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1409 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1410 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
| 1411 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
| 1412 | uint32_t oc, oy, ox;
|
---|
| 1413 |
|
---|
| 1414 | float height_scale = (float)in_shape.height / out_height;
|
---|
| 1415 | float width_scale = (float)in_shape.width / out_width;
|
---|
| 1416 |
|
---|
| 1417 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
| 1418 | {
|
---|
| 1419 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
| 1420 | for(oy = 0; oy < out_height; oy++)
|
---|
| 1421 | {
|
---|
| 1422 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
| 1423 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
| 1424 | for(ox = 0; ox < out_width; ox++)
|
---|
| 1425 | {
|
---|
| 1426 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
| 1427 | *dest++ = y_origin[in_x];
|
---|
| 1428 | }
|
---|
| 1429 | }
|
---|
| 1430 | }
|
---|
| 1431 | }
|
---|
| 1432 |
|
---|
| 1433 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1434 | {
|
---|
| 1435 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1436 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1437 | size_t oc, channels = arg->channels;
|
---|
| 1438 |
|
---|
| 1439 | for(oc = 0; oc < channels; oc++)
|
---|
| 1440 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
| 1441 | }
|
---|
| 1442 |
|
---|
/* Execute one hardware convolution layer described by `arg`.
 * Copies the layer descriptor out of the model buffer, patches its
 * weight / batch-norm / activation table addresses, configures KPU
 * interrupts (and, for KLF_MAIN_MEM_OUT, an AI-RX DMA transfer into the
 * main buffer), then pushes the descriptor to the KPU. */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    /* Rebase blob offsets to addresses the KPU can fetch.  NOTE(review):
     * IOMEM is subtracted — presumably converting the CPU's I/O alias to
     * the DMA-visible physical address; confirm against the K210 memory
     * map. */
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if(arg->flags & KLF_MAIN_MEM_OUT)
    {
        /* Output goes to main memory: mask all KPU interrupts and let the
         * DMA-completion callback drive the model forward (ai_step for
         * intermediate layers, kpu_kmodel_done for the last one). */
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        if(ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        /* 64-bit FIFO reads; transfer length is expressed in 8-byte
         * beats, rounded up from dma_total_byte. */
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    } else
    {
        /* Output stays in KPU RAM: unmask calc_done so the layer-complete
         * interrupt advances the state machine instead of DMA. */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* "interrupt_enabe" is the SDK's own (misspelled) field name. */
        layer.interrupt_enabe.data.int_en = 1;
    }

    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
|
---|
| 1486 |
|
---|
| 1487 | static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1488 | {
|
---|
| 1489 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1490 | #if USE_CACHED_AI_RAM
|
---|
| 1491 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
|
---|
| 1492 | #else
|
---|
| 1493 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
|
---|
| 1494 | #endif
|
---|
| 1495 |
|
---|
| 1496 | uint32_t row_padding = 16;
|
---|
| 1497 | uint32_t row_group = 4;
|
---|
| 1498 | uint32_t row_length = 1;
|
---|
| 1499 | uint32_t height = 4;
|
---|
| 1500 | uint32_t oc, x, y, channels = arg->channels;
|
---|
| 1501 |
|
---|
| 1502 | for(oc = 0; oc < channels; oc++)
|
---|
| 1503 | {
|
---|
| 1504 | uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
---|
| 1505 | for(y = 0; y < 1; y++)
|
---|
| 1506 | {
|
---|
| 1507 | uint8_t *y_origin = channel_origin + y * row_length * 64;
|
---|
| 1508 | for(x = 0; x < 1; x++)
|
---|
| 1509 | y_origin[x] = *src++;
|
---|
| 1510 | }
|
---|
| 1511 | }
|
---|
| 1512 |
|
---|
| 1513 | #if USE_CACHED_AI_RAM
|
---|
| 1514 | uint32_t lines = row_length * height * channels / row_group;
|
---|
| 1515 | kpu_flush_cache(arg->kpu_mem_out_address, lines);
|
---|
| 1516 | #endif
|
---|
| 1517 | }
|
---|
| 1518 |
|
---|
| 1519 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1520 | {
|
---|
| 1521 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
| 1522 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
| 1523 | uint32_t oc, channels = arg->channels;
|
---|
| 1524 |
|
---|
| 1525 | for(oc = 0; oc < channels; oc++)
|
---|
| 1526 | *dest++ = src[oc * 16];
|
---|
| 1527 | }
|
---|
| 1528 |
|
---|
| 1529 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
| 1530 | {
|
---|
| 1531 | size_t width = arg->width;
|
---|
| 1532 | size_t height = arg->height;
|
---|
| 1533 | size_t channels = arg->channels;
|
---|
| 1534 |
|
---|
| 1535 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
| 1536 | }
|
---|
| 1537 |
|
---|
| 1538 | int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
---|
| 1539 | {
|
---|
| 1540 | #if FIX_CACHE
|
---|
| 1541 | configASSERT(is_memory_cache((uintptr_t)buffer));
|
---|
| 1542 | #endif
|
---|
| 1543 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
| 1544 | const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
---|
| 1545 |
|
---|
| 1546 | if (header->version == 3 && header->arch == 0)
|
---|
| 1547 | {
|
---|
| 1548 | ctx->is_nncase = 0;
|
---|
| 1549 | ctx->model_buffer = buffer;
|
---|
| 1550 | ctx->output_count = header->output_count;
|
---|
| 1551 | ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
---|
| 1552 | ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
|
---|
| 1553 | ctx->layers_length = header->layers_length;
|
---|
| 1554 | ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
|
---|
| 1555 | ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
---|
| 1556 | if (!ctx->main_buffer)
|
---|
| 1557 | return -1;
|
---|
| 1558 | uint32_t body_size = 0;
|
---|
| 1559 | for (int i=0; i<ctx->layers_length; i++)
|
---|
| 1560 | {
|
---|
| 1561 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
---|
| 1562 | body_size += cnt_layer_header->body_size;
|
---|
| 1563 | }
|
---|
| 1564 | uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
---|
| 1565 | const uint8_t *body_start_cache = ctx->body_start;
|
---|
| 1566 | memcpy(body_start_iomem, body_start_cache, body_size);
|
---|
| 1567 | for (int i=0; i<body_size; i++)
|
---|
| 1568 | {
|
---|
| 1569 | configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
---|
| 1570 | }
|
---|
| 1571 |
|
---|
| 1572 | } else
|
---|
| 1573 | {
|
---|
| 1574 | return -1;
|
---|
| 1575 | }
|
---|
| 1576 |
|
---|
| 1577 | return 0;
|
---|
| 1578 | }
|
---|
| 1579 |
|
---|
| 1580 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
| 1581 | {
|
---|
| 1582 | if(ctx->is_nncase)
|
---|
| 1583 | return -1;
|
---|
| 1584 |
|
---|
| 1585 | if(index >= ctx->output_count)
|
---|
| 1586 | return -1;
|
---|
| 1587 |
|
---|
| 1588 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
| 1589 | *data = ctx->main_buffer + output->address;
|
---|
| 1590 | *size = output->size;
|
---|
| 1591 | return 0;
|
---|
| 1592 | }
|
---|
| 1593 |
|
---|
| 1594 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
| 1595 | {
|
---|
| 1596 | if(ctx->is_nncase)
|
---|
| 1597 | return;
|
---|
| 1598 |
|
---|
| 1599 | free(ctx->main_buffer);
|
---|
| 1600 | ctx->main_buffer = NULL;
|
---|
| 1601 | }
|
---|
| 1602 |
|
---|
#if KPU_DEBUG
/* Per-run profiling state, reset in kpu_run_kmodel. */
static uint64_t last_time;       /* timestamp (us) at the previous layer boundary; 0 = not started */
static uint64_t total_time;      /* accumulated time across all layers (us) */
static uint64_t kpu_time;        /* portion of total_time spent in K210 conv layers (us) */
static uint32_t last_layer_type; /* type of the layer whose duration is currently being measured */

/* Maps a layer-type enum value to a short human-readable name for syslog. */
static const char *str_layer_type(uint32_t type)
{
    switch(type)
    {
        case KL_ADD:
            return "Add";
        case KL_QUANTIZED_ADD:
            return "QuantAdd";
        case KL_GLOBAL_AVERAGE_POOL2D:
            return "GAP";
        case KL_QUANTIZED_MAX_POOL2D:
            return "QuantMaxPool2d";
        case KL_AVERAGE_POOL2D:
            return "AveragePool2d";
        case KL_QUANTIZE:
            return "Quantize";
        case KL_DEQUANTIZE:
            return "Dequantize";
        case KL_REQUANTIZE:
            return "Requantize";
        case KL_L2_NORMALIZATION:
            return "L2Norm";
        case KL_SOFTMAX:
            return "Softmax";
        case KL_CONCAT:
            return "Concat";
        case KL_QUANTIZED_CONCAT:
            return "QuantConcat";
        case KL_FULLY_CONNECTED:
            return "FullyConnected";
        case KL_TENSORFLOW_FLATTEN:
            return "TFFlatten";
        case KL_RESIZE_NEAREST_NEIGHBOR:
            return "ResizeNearestNeighbor";
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            return "QuantResizeNearestNeighbor";
        case KL_CHANNELWISE_DEQUANTIZE:
            return "ChannelwiseDequantize";
        case KL_LOGISTIC:
            return "Logistic";
        case KL_K210_CONV:
            return "K210Conv";
        case KL_K210_ADD_PADDING:
            return "K210AddPad";
        case KL_K210_REMOVE_PADDING:
            return "K210RemovePad";
        case KL_K210_UPLOAD:
            return "K210Upload";
        default:
            return "Unknown";
    }
}
#endif
|
---|
| 1662 |
|
---|
/* Completion handler for a finished model run: quiesces KPU interrupts,
 * emits per-layer timing statistics when KPU_DEBUG is enabled, and invokes
 * the user's done callback. Returns 0 (plic_irq_callback_t convention). */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge and mask every KPU interrupt source — the run is over. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    uint32_t cnt_layer_id = ctx->current_layer - 1;
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        /* Close out the timing of the final layer. */
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    /* Summary: "KPU" is conv hardware time, "CPU" is everything else. */
    syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
    syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
    syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
#endif
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
| 1692 |
|
---|
/* Executes the next layer of the model. Invoked both as an IRQ callback
 * (DMA completion / KPU calc-done) and directly via ai_step_not_isr.
 * CPU-implemented layers run inline and chain into the next step; a
 * K210Conv layer is handed to the hardware and this function returns,
 * letting the conv's interrupt resume the chain. Returns 0, or -1 on a
 * layer-table overrun or an unknown layer type. */
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer;
    const uint8_t *layer_body = ctx->current_body;
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    /* Guard against a stray interrupt arriving after the last layer. */
    if (cnt_layer_id >= ctx->layers_length) {
        //syslog(LOG_NOTICE, "overrun");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* Advance the cursor before dispatch so IRQ-driven layers (conv) see
     * the position of the next layer when their completion handler runs. */
    ctx->current_layer++;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        /* Duration of the layer that just finished (type in last_layer_type). */
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    /* Dispatch on layer type; each case reinterprets layer_body as that
     * layer's argument struct. */
    switch(cnt_layer_header->type)
    {
        case KL_ADD:
            kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_ADD:
            kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_GLOBAL_AVERAGE_POOL2D:
            kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_MAX_POOL2D:
            kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_AVERAGE_POOL2D:
            kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZE:
            kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_DEQUANTIZE:
            kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_REQUANTIZE:
            kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_L2_NORMALIZATION:
            kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
            break;
        case KL_SOFTMAX:
            kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CONCAT:
        case KL_QUANTIZED_CONCAT:
            kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
            break;
        case KL_FULLY_CONNECTED:
            kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
            break;
        case KL_TENSORFLOW_FLATTEN:
            kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
            break;
        case KL_RESIZE_NEAREST_NEIGHBOR:
            kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CHANNELWISE_DEQUANTIZE:
            kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
            break;
        case KL_LOGISTIC:
            kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_CONV:
            kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
            /* Hardware layer: its completion interrupt continues the chain. */
            return 0;
        case KL_K210_ADD_PADDING:
            kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_REMOVE_PADDING:
            kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_UPLOAD:
            kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
            break;
        default:
            assert(!"Layer is not supported.");
            kpu_kmodel_done(ctx);
            return -1;
    }

    /* CPU layer finished synchronously: recurse into the next layer, or
     * finish the model when this was the last one. */
    if (ctx->current_layer < ctx->layers_length)
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
|
---|
| 1802 |
|
---|
/* Runs one scheduler step outside of interrupt context, with external
 * interrupts masked so it cannot race the KPU/DMA ISRs. */
static void ai_step_not_isr(void *ctx)
{
    sysctl_disable_irq();
    ai_step(ctx);
    /* Re-enable interrupts so subsequent completion IRQs can fire. */
    sysctl_enable_irq();
}
|
---|
| 1809 |
|
---|
/**
 * Starts asynchronous execution of a loaded kmodel.
 *
 * Resets the layer cursor, programs the KPU interrupt/FIFO registers,
 * registers the AI interrupt handler, feeds the input through the path
 * required by the first layer (DMA stream or CPU copy with padding), and
 * returns immediately; @p done_callback fires from interrupt context when
 * inference completes.
 *
 * @param ctx            Context previously initialized by kpu_load_kmodel.
 * @param src            Input tensor data.
 * @param dma_ch         DMA channel used for KPU input/output transfers.
 * @param done_callback  Invoked with @p userdata when inference finishes.
 * @param userdata       Opaque pointer passed to @p done_callback.
 * @return 0 on success, -1 for nncase contexts or an unsupported first layer.
 */
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    if(ctx->is_nncase)
        return -1;

    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    /* Reset the profiling accumulators for this run. */
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    /* Acknowledge any stale interrupts (bits 2:0). */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    /* Bit 0 of the model flags selects 8-bit weight mode. */
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, ai_step, ctx);
    plic_irq_enable(INTNO_AI);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;

    /* The input path depends on the first layer's type. */
    switch(first_layer_header->type)
    {
        case KL_K210_CONV:
        {
            const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
            kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

            /* Rows that are not a multiple of 64 bytes need CPU-side padding;
             * otherwise the input can be streamed in directly by DMA. */
            if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
            {
                kpu_kmodel_input_with_padding(&layer_arg, src);
                ai_step_not_isr(ctx);
            } else
            {
                kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
            }
        }
        break;
        case KL_FULLY_CONNECTED:
        {
            /* Float input is copied straight into the working buffer. */
            const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
            kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
            ai_step_not_isr(ctx);
        }
        break;
        default:
            return -1;
    }

    return 0;
}
|
---|