1 | #include <assert.h>
|
---|
2 | #include <float.h>
|
---|
3 | #include <math.h>
|
---|
4 | #include <stdio.h>
|
---|
5 | #include <stdlib.h>
|
---|
6 | #include <string.h>
|
---|
7 | #include <stdint.h>
|
---|
8 | #include <kernel.h>
|
---|
9 | #include <t_syslog.h>
|
---|
10 | #include <t_stdlib.h>
|
---|
11 | #include <kernel_impl.h>
|
---|
12 | #include <target_syssvc.h>
|
---|
13 | #include "kendryte-k210.h"
|
---|
14 | #include "device.h"
|
---|
15 | #include "atomic.h"
|
---|
16 | #include "kpu.h"
|
---|
17 | #include "utils.h"
|
---|
18 | #include "kpu_main.h"
|
---|
19 |
|
---|
/* Read-modify-write OR of bits `b` into the memory-mapped register at `a`.
   NOTE: `a` is evaluated twice — pass only side-effect-free expressions. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
21 |
|
---|
/*
 * Enable machine-external interrupts: set the external-interrupt enable
 * bit in mie and the global machine interrupt-enable bit in mstatus.
 */
void sysctl_enable_irq(void)
{
    /* NOTE(review): MIP_MEIP is used as the mie mask here — relies on
       MEIP/MEIE sharing the same bit position. */
    set_csr(mie, MIP_MEIP);
    set_csr(mstatus, MSTATUS_MIE);
}
|
---|
27 |
|
---|
/*
 * Disable machine-external interrupts: the mirror of sysctl_enable_irq().
 */
void sysctl_disable_irq(void)
{
    clear_csr(mie, MIP_MEIP);
    clear_csr(mstatus, MSTATUS_MIE);
}
|
---|
33 |
|
---|
34 | uint64_t sysctl_get_time_us(void)
|
---|
35 | {
|
---|
36 | uint64_t v_cycle = read_cycle();
|
---|
37 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
38 | }
|
---|
39 |
|
---|
/*
 * Return 1 when `address` lies in general-purpose RAM (6 MiB at
 * 0x80000000), the uncached RAM alias (8 MiB at 0x40000000), or is the
 * KPU FIFO output register (0x50450040); otherwise 0.
 */
static int is_memory(uintptr_t address)
{
    const uintptr_t ram_base = 0x80000000;
    const uintptr_t ram_len = 6 * 1024 * 1024;
    const uintptr_t nocache_base = 0x40000000;
    const uintptr_t nocache_len = 8 * 1024 * 1024;
    const uintptr_t kpu_fifo_reg = 0x50450040;

    if (address >= ram_base && address < ram_base + ram_len)
        return 1;
    if (address >= nocache_base && address < nocache_base + nocache_len)
        return 1;
    return address == kpu_fifo_reg;
}
|
---|
49 |
|
---|
/*
 * Return nonzero when `address` lies in the cached general-purpose RAM
 * window (first 6 MiB at 0x80000000).
 */
uint32_t is_memory_cache(uintptr_t address)
{
#define MEM_CACHE_LEN (6 * 1024 * 1024)
    const uintptr_t cache_base = 0x80000000;

    if (address < cache_base)
        return 0;
    return (address < cache_base + MEM_CACHE_LEN) ? 1 : 0;
}
|
---|
56 |
|
---|
57 | int plic_irq_enable(INTNO irq_number)
|
---|
58 | {
|
---|
59 | if (irq_number != INTNO_AI)
|
---|
60 | return -1;
|
---|
61 | ena_int(irq_number);
|
---|
62 | return 0;
|
---|
63 | }
|
---|
64 |
|
---|
65 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
66 | {
|
---|
67 | if (irq_number != INTNO_AI)
|
---|
68 | return -1;
|
---|
69 | set_ipriority(irq_number, priority);
|
---|
70 | return 0;
|
---|
71 | }
|
---|
72 |
|
---|
/* Callback invoked from ai_done_isr() when the KPU raises its interrupt,
   plus the opaque context passed to it.  Set via plic_irq_register(). */
plic_irq_callback_t ai_done_callback;
void *ai_done_ctx;
|
---|
75 |
|
---|
/*
 * Register the AI-interrupt callback and its context.  Only INTNO_AI is
 * accepted; other interrupt numbers are silently ignored.
 *
 * The (callback, ctx) pair is written inside a CPU-locked section so the
 * ISR never observes a half-updated pair.
 */
void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
{
    ER ret;
    if (irq != INTNO_AI)
        return;

    ret = loc_cpu();

    ai_done_callback = callback;
    ai_done_ctx = ctx;

    /* Unlock only when the lock actually succeeded. */
    if (ret == E_OK)
        unl_cpu();
}
|
---|
90 |
|
---|
/*
 * AI (KPU) interrupt service routine: dispatch to the registered
 * callback with machine-external interrupts masked.
 *
 * `exinf` is the kernel-supplied extended info; it is unused because the
 * context pointer is taken from ai_done_ctx instead.
 */
void ai_done_isr(intptr_t exinf)
{
    sysctl_disable_irq();
    if (ai_done_callback != NULL){
        ai_done_callback(ai_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
99 |
|
---|
/* Callback invoked from ai_dma_done_isr() when the AI DMA channel
   completes, plus the opaque context passed to it. */
plic_irq_callback_t ai_dma_done_callback;
void *ai_dma_done_ctx;
|
---|
102 |
|
---|
/*
 * Register a completion callback for the AI DMA channel and set the DMA
 * interrupt priority.  Only AI_DMA_CH is accepted; other channels are
 * silently ignored.
 *
 * As in plic_irq_register(), the (callback, ctx) pair is updated inside
 * a CPU-locked section to keep it consistent for the ISR.
 */
void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
        plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
{
    ER ret;
    if (channel_num != AI_DMA_CH)
        return;

    set_ipriority(INTNO_DMAAI, priority);

    ret = loc_cpu();

    ai_dma_done_callback = dmac_callback;
    ai_dma_done_ctx = ctx;

    if (ret == E_OK)
        unl_cpu();
}
|
---|
120 |
|
---|
/*
 * AI DMA-complete interrupt service routine: dispatch to the registered
 * callback with machine-external interrupts masked.
 *
 * `dma` is the handle the DMA driver passes in; it is unused because the
 * context is taken from ai_dma_done_ctx instead.
 */
void ai_dma_done_isr(DMA_Handle_t *dma)
{
    sysctl_disable_irq();
    if (ai_dma_done_callback != NULL) {
        ai_dma_done_callback(ai_dma_done_ctx);
    }
    sysctl_enable_irq();
}
|
---|
129 |
|
---|
130 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
131 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
132 | {
|
---|
133 | ER ret;
|
---|
134 | if (channel_num != AI_DMA_CH)
|
---|
135 | return;
|
---|
136 |
|
---|
137 | set_ipriority(INTNO_DMAAI, priority);
|
---|
138 |
|
---|
139 | ret = loc_cpu();
|
---|
140 |
|
---|
141 | ai_dma_done_callback = dmac_callback;
|
---|
142 | ai_dma_done_ctx = ctx;
|
---|
143 |
|
---|
144 | if (ret == E_OK)
|
---|
145 | unl_cpu();
|
---|
146 | }
|
---|
147 |
|
---|
/* Single shared DMA handle for the AI channel; reused for both the input
   and output transfers of a run. */
DMA_Handle_t g_ai_hdma;
|
---|
149 |
|
---|
/*
 * Configure and start a single-block DMA transfer on the AI channel.
 * The transfer direction and handshake modes are derived from whether
 * src/dest look like memory addresses (see is_memory()); non-memory
 * addresses are treated as peripherals.  Only AI_DMA_CH is accepted.
 *
 * `block_size` is in units of `dmac_trans_width`-wide elements.
 */
void dmac_set_single_mode(dmac_channel_number_t channel_num,
        const void *src, void *dest, uint8_t src_inc,
        uint8_t dest_inc,
        uint8_t dmac_burst_size,
        uint8_t dmac_trans_width,
        size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    uint8_t flow_control;
    if(mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if(mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if(mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                              /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);   /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* destination handshake */
    hdma->Init.SrcInc = src_inc;                   /* source address increment mode */
    hdma->Init.DstInc = dest_inc;                  /* destination address increment mode */
    hdma->Init.SrcTransWidth = dmac_trans_width;   /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;   /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;     /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;     /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}
|
---|
185 |
|
---|
/* Number of layers pushed to the KPU argument FIFO per interrupt. */
#define LAYER_BURST_SIZE 12

#define KPU_DEBUG 0
#define USE_CACHED_AI_RAM 0

/* NOTE: arguments may be evaluated twice — avoid side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
/* Round x up to a multiple of align (align must be a power of two).
   Fix: the original left `x` and `align` unparenthesized, so expression
   arguments such as ALIGN_UP(n, 1 << 2) expanded incorrectly. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
194 |
|
---|
/* Forward declarations for the kmodel execution engine (defined below). */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);

/* Memory-mapped KPU register file. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* Busy flag for the kpu_start() path: 0 = idle, 1 = running.
   Guarded with atomic_cas()/atomic_swap(). */
static volatile uint32_t kpu_status;

/* Per-run state for the legacy kpu_run() path. */
typedef struct kpu_context
{
    kpu_task_t kpu_task;  /* private snapshot of the caller's task */
    uint32_t kpu_status;  /* 0 = idle, 1 = running */
} kpu_context_t;

volatile kpu_context_t g_kpu_context;
|
---|
208 |
|
---|
/*
 * Final completion handler for kpu_run(): mark the KPU idle, then invoke
 * the user's callback with the task itself as argument.
 */
static int kpu_run_all_done(void *_task)
{
    /* Clear busy state BEFORE the callback so it may start a new run. */
    atomic_swap(&g_kpu_context.kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)_task;
    task->callback(task);
    return 0;
}
|
---|
216 |
|
---|
/*
 * Feed the next batch of layer arguments into the KPU layer-argument
 * FIFO.  Called once at run start and again from the AI interrupt each
 * time the FIFO drains.  The 12 register writes per layer are consumed
 * positionally by the hardware — their order must not change.
 *
 * Returns 0 always (plic_irq_callback_t signature).
 */
int kpu_continue(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    /* One layer per interrupt on this path (cf. LAYER_BURST_SIZE used by
       kpu_config_input()). */
    int layer_burst_size = 1;

    /* Acknowledge all pending KPU interrupt causes. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};

    if(task->remain_layers_length == 0)
    {
        return 0;
    }
    if(task->remain_layers_length <= layer_burst_size)
    {
        /* Final batch: push everything that is left. */
        for(uint32_t i = 0; i < task->remain_layers_length; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers_length = 0;
    } else
    {
        /* Push one burst and advance the cursor. */
        for(uint32_t i = 0; i < layer_burst_size; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers += layer_burst_size;
        task->remain_layers_length -= layer_burst_size;
    }
    return 0;
}
|
---|
271 |
|
---|
272 | static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
|
---|
273 | {
|
---|
274 | select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
|
---|
275 | kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
|
---|
276 | dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
|
---|
277 | DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
|
---|
278 | return 0;
|
---|
279 | }
|
---|
280 |
|
---|
/*
 * DMA-input-complete handler for kpu_run(): the input image is now in
 * KPU RAM, so configure the KPU, arm the output DMA, and start feeding
 * layer arguments.
 */
static int kpu_run_dma_input_done_push_layers(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    /* Acknowledge all three KPU interrupt causes (bits 0..2). */
    kpu->interrupt_clear.reg = 7;
    dma_end(&g_ai_hdma);
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = task->eight_bit_mode};

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte holds (bytes - 1), hence the +1. */
    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);

    /* Unmask calc-done and FIFO-almost-empty; keep almost-full masked. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_continue(task);
    return 0;
}
|
---|
302 |
|
---|
/*
 * Start the DMA transfer of the input image into KPU I/O RAM; `cb` fires
 * when the copy completes.  The input size (in bytes) is derived from
 * the first layer's geometry; the transfer runs in 64-bit units.
 */
static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
{
    kpu_task_t *task = _task;
    kpu_layer_argument_t *first_layer = &task->layers[0];
    /* channel_switch_addr is in 64-byte rows; i_ch_num holds (channels - 1). */
    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
    kpu_dmac_irq_register(dma_ch, cb, _task, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
312 |
|
---|
/*
 * Kick off a full KPU run: copy the task descriptor, mark the last layer
 * to stream its result out via DMA, set up the AI interrupt and start
 * the input DMA.  `callback` fires when the final output DMA completes.
 *
 * Returns 0 on success, -1 if a run is already in progress.
 */
int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
{
    /* Atomically claim the KPU; fail if it is already busy. */
    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
        return -1;

    /* Work on a private snapshot so the caller's task stays untouched. */
    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;

    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];

    /* dma_total_byte holds (bytes - 1). */
    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;

    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;

    task->dma_ch = dma_ch;
    task->dst = dest;
    task->dst_length = output_size;
    task->callback = callback;
    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;

    plic_set_priority(INTNO_AI, 1);
    /* Each KPU interrupt feeds the next layer batch. */
    plic_irq_register(INTNO_AI, kpu_continue, task);
    plic_irq_enable(INTNO_AI);

    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);

    return 0;
}
|
---|
343 |
|
---|
344 | uint8_t *kpu_get_output_buf(kpu_task_t *task)
|
---|
345 | {
|
---|
346 | kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
|
---|
347 | size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
|
---|
348 | return malloc(output_size);
|
---|
349 | }
|
---|
350 |
|
---|
/*
 * Free a buffer obtained from kpu_get_output_buf().
 * free(NULL) is a no-op per the C standard, so the original's NULL
 * guard was redundant and has been removed.
 */
void kpu_release_output_buf(uint8_t *output_buf)
{
    free(output_buf);
}
|
---|
356 |
|
---|
/*
 * Final completion handler for the kpu_start() path: mark the KPU idle,
 * then invoke the user's callback with the task's user context.
 */
static int kpu_done(void *ctx)
{
    /* Release the busy flag before calling out, so the callback may
       immediately start another run. */
    atomic_swap(&kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)ctx;
    task->callback(task->ctx);
    return 0;
}
|
---|
364 |
|
---|
/*
 * Push the next burst of layer arguments (up to LAYER_BURST_SIZE) into
 * the KPU layer-argument FIFO.  Registered as the AI interrupt handler
 * by kpu_data_ready(); re-fires each time the FIFO drains.  The 12
 * writes per layer are consumed positionally — their order is fixed.
 *
 * Returns 0 always (plic_irq_callback_t signature).
 */
static int kpu_config_input(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;
    /* Acknowledge all pending KPU interrupt causes. */
    kpu->interrupt_clear.reg = 7;
    if(task->remain_layers_length <= LAYER_BURST_SIZE)
    {
        /* Final batch: push everything left, then mask all KPU
           interrupts — completion is signalled by the output DMA. */
        for(uint32_t i = 0; i < task->remain_layers_length; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers_length = 0;
        kpu->interrupt_mask.reg = 7;
    } else
    {
        for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers += LAYER_BURST_SIZE;
        task->remain_layers_length -= LAYER_BURST_SIZE;
    }
    return 0;
}
|
---|
410 |
|
---|
/*
 * Arm the DMA transfer that drains the KPU output FIFO into task->dst.
 * task->dst_length is in 64-bit units.  kpu_done() fires on completion.
 */
static void kpu_data_output(kpu_task_t *task)
{
    select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
    kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
}
|
---|
418 |
|
---|
/*
 * Input-ready handler for the kpu_start() path: the input image is in
 * KPU RAM, so arm the output DMA, program KPU mode/thresholds, hook the
 * AI interrupt to kpu_config_input() and push the first layer burst.
 *
 * Returns 0 always (plic_irq_callback_t signature).
 */
static int kpu_data_ready(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;

    dma_end(&g_ai_hdma);
    kpu_data_output(task);

    kpu->eight_bit_mode.reg = task->eight_bit_mode;
    /* Mask and acknowledge everything while reconfiguring. */
    kpu->interrupt_mask.reg = 7;
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 12, .fifo_empty_threshold = 1};

    plic_set_priority(INTNO_AI, 2);
    plic_irq_register(INTNO_AI, kpu_config_input, task);
    plic_irq_enable(INTNO_AI);
    /* Prime the FIFO with the first burst before unmasking. */
    kpu_config_input(task);
    /* Unmask only the FIFO-almost-empty interrupt to drive refills. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    return 0;
}
|
---|
442 |
|
---|
/*
 * Stage the input image for a kpu_start() run.  When task->src is NULL
 * the input is assumed to already be in KPU RAM and the ready handler
 * runs immediately; otherwise a DMA copy into the first layer's image
 * address is started with kpu_data_ready() as its completion callback.
 */
static void kpu_data_input(kpu_task_t *task)
{
    if(task->src == NULL)
    {
        kpu_data_ready(task);
        return;
    }
    kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
    kpu_layer_argument_t *layer = &task->layers[0];
    /* image_src_addr is in 64-byte rows; src_length is in 64-bit units. */
    dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
}
|
---|
455 |
|
---|
456 | int kpu_single_task_init(kpu_task_t *task)
|
---|
457 | {
|
---|
458 | /*
|
---|
459 | * AIクロック有効化
|
---|
460 | */
|
---|
461 | sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);
|
---|
462 |
|
---|
463 | kpu_layer_argument_t *first_layer = &task->layers[0];
|
---|
464 | kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
|
---|
465 |
|
---|
466 | last_layer->dma_parameter.data.send_data_out = 1;
|
---|
467 | last_layer->interrupt_enabe.data.int_en = 1;
|
---|
468 | task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
|
---|
469 | task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
|
---|
470 | task->dst = (uint64_t *)malloc(task->dst_length * 8);
|
---|
471 | if(task->dst == NULL)
|
---|
472 | return 1;
|
---|
473 | memset(task->dst, 0, task->dst_length * 8);
|
---|
474 | return 0;
|
---|
475 | }
|
---|
476 |
|
---|
477 | int kpu_single_task_deinit(kpu_task_t *task)
|
---|
478 | {
|
---|
479 | free(task->dst);
|
---|
480 | return 0;
|
---|
481 | }
|
---|
482 |
|
---|
/*
 * Parse an in-memory kmodel (header version 1) and populate `task`.
 *
 * The layer table inside `buffer` is patched IN PLACE: per-layer weight,
 * batch-norm and activation offsets are rebased to absolute addresses
 * within `buffer`, so the buffer must stay alive and unmoved for the
 * task's whole lifetime, and must not be reloaded twice.
 *
 * On success, *meta (if non-NULL) receives the layer metadata table.
 * Returns 0 on success, -1 on unsupported header version.
 */
int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
{
    uintptr_t base_addr = (uintptr_t)buffer;
    kpu_model_header_t *header = (kpu_model_header_t *)buffer;
    kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
    kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);

    if(header->version != 1)
        return -1;
    uint32_t layers_length = header->layers_length;
    task->layers_length = layers_length;
    /* Bit 0 of flags selects 8-bit mode. */
    task->eight_bit_mode = header->flags & 1;
    task->layers = layers;
    /* Output quantization comes from the last layer's metadata. */
    task->output_scale = layer_meta[layers_length - 1].output_scale;
    task->output_bias = layer_meta[layers_length - 1].output_bias;
    size_t i;
    for(i = 0; i < layers_length; i++)
    {
        /* Rebase stored offsets to absolute addresses inside the buffer. */
        layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
        layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
        layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
    }

    if(meta)
        *meta = layer_meta;
    return 0;
}
|
---|
510 |
|
---|
/*
 * Start a KPU run on a task prepared by kpu_single_task_init() /
 * kpu_model_load_from_buffer().  task->callback fires on completion.
 *
 * Returns 0 on success, -1 if a run is already in progress.
 */
int kpu_start(kpu_task_t *task)
{
    /* Atomically claim the KPU; fail if already busy. */
    if(atomic_cas(&kpu_status, 0, 1))
        return -1;

    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;
    kpu_data_input(task);
    return 0;
}
|
---|
521 |
|
---|
/*
 * Push one layer's 12 argument words into the KPU layer-argument FIFO.
 * The hardware consumes the words positionally, so the write order
 * below is fixed and must not be changed.
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
537 |
|
---|
/*
 * Initialize the KPU for layer-by-layer use (kpu_conv2d* API): reset
 * interrupt state, set FIFO thresholds and data width, and hook the AI
 * interrupt to the caller's handler.
 */
void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    /* Acknowledge any stale interrupt causes. */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = eight_bit_mode};
    /* Unmask only FIFO-almost-empty. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, callback, userdata);
    plic_irq_enable(INTNO_AI);
}
|
---|
554 |
|
---|
/*
 * DMA the input image for `layer` into KPU I/O RAM at the layer's image
 * source address; `callback` fires on completion.  The byte count is
 * derived from the layer geometry and transferred in 64-bit units.
 */
void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    /* channel_switch_addr is in 64-byte rows; i_ch_num holds (channels - 1). */
    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}
|
---|
562 |
|
---|
/* Submit one convolution layer's arguments to the KPU FIFO. */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
567 |
|
---|
/*
 * Run one convolution layer whose result stays in KPU RAM (no DMA out).
 * Completion is observed via the calc-done interrupt left unmasked here.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* Unmask FIFO-almost-empty only; calc-done stays masked. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
580 |
|
---|
/*
 * Run one convolution layer and DMA its output FIFO stream into `dest`;
 * `callback` fires when the DMA completes.  All KPU interrupts are
 * masked — completion is driven entirely by the DMA.
 */
void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    /* dma_total_byte holds (bytes - 1); (total + 8) / 8 rounds the
       actual byte count up to 64-bit units. */
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
    kpu_conv2d_core(layer);
}
|
---|
598 |
|
---|
/*
 * Run one convolution layer in "full add" mode and DMA one 64-bit
 * accumulated value per output channel into `dest`; `callback` fires
 * when the DMA completes.
 */
void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
{
    /* o_ch_num holds (channels - 1). */
    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
    layer->interrupt_enabe.data.full_add = 1;

    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    /* One 64-bit word per channel. */
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
            DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
    kpu_conv2d_core(layer);
}
|
---|
619 |
|
---|
620 | void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
|
---|
621 | {
|
---|
622 | quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
|
---|
623 |
|
---|
624 | size_t i;
|
---|
625 | for(i = 0; i < count; i++)
|
---|
626 | {
|
---|
627 | int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
|
---|
628 | if(value < 0)
|
---|
629 | value = 0;
|
---|
630 | if(value > 0xFF)
|
---|
631 | value = 0xFF;
|
---|
632 | *dest++ = value;
|
---|
633 | }
|
---|
634 | }
|
---|
635 |
|
---|
/*
 * Global average pooling over `channels` maps of `kernel_size` pixels
 * each, reading quantized bytes from `src` and writing requantized
 * bytes to `dest` (saturated to [0, 255]).
 *
 * When `dest` points into KPU I/O RAM, each 1x1 result is scattered
 * into the KPU row layout (4 channels per 64-byte row, 16-byte stride);
 * otherwise results are written densely, one byte per channel.
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU layout for a 1x1 output image (width <= 16 case). */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < 1; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for(i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize, average, requantize, saturate. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if(value < 0)
                        value = 0;
                    if(value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    } else
    {
        for(oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for(i = 0; i < kernel_size; i++)
                sum += *src++;

            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if(value < 0)
                value = 0;
            if(value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
688 |
|
---|
689 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
690 | {
|
---|
691 | quantize_param_t q = *src_param;
|
---|
692 | size_t oc;
|
---|
693 |
|
---|
694 | for(oc = 0; oc < channels; oc++)
|
---|
695 | {
|
---|
696 | int64_t sum = 0;
|
---|
697 | size_t i;
|
---|
698 | for(i = 0; i < kernel_size; i++)
|
---|
699 | sum += *src++;
|
---|
700 |
|
---|
701 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
702 | dest[oc] = value;
|
---|
703 | }
|
---|
704 | }
|
---|
705 |
|
---|
706 | void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
|
---|
707 | {
|
---|
708 | quantize_param_t q1 = *dest_param;
|
---|
709 | size_t i = 0;
|
---|
710 | for(i = 0; i < channels; i++)
|
---|
711 | *dest++ = src[i * 16] * q1.scale + q1.bias;
|
---|
712 | }
|
---|
713 |
|
---|
/*
 * Dense (fully connected) layer: for each output channel, take the dot
 * product of `src` with the corresponding row of `weights` and add the
 * channel bias.  Weights are stored row-major, one row per output.
 */
void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
{
    for(int out = 0; out < output_channels; out++)
    {
        const float *row = &weights[out * input_channels];

        /* Accumulate the dot product first, add the bias last (keeps
           the original float summation order). */
        float acc = 0.0f;
        for(int in = 0; in < input_channels; in++)
            acc += src[in] * row[in];
        dest[out] = acc + biases[out];
    }
}
|
---|
727 |
|
---|
728 | void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
|
---|
729 | {
|
---|
730 | quantize_param_t q1 = *src_param;
|
---|
731 | size_t i = 0;
|
---|
732 | for(i = 0; i < count; i++)
|
---|
733 | *dest++ = src[i] * q1.scale + q1.bias;
|
---|
734 | }
|
---|
735 |
|
---|
736 | void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
|
---|
737 | {
|
---|
738 | uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
|
---|
739 | size_t oc, y, x;
|
---|
740 |
|
---|
741 | uint32_t row_padding;
|
---|
742 | uint32_t row_group;
|
---|
743 | uint32_t row_length;
|
---|
744 |
|
---|
745 | if(width <= 16)
|
---|
746 | {
|
---|
747 | row_padding = 16;
|
---|
748 | row_group = 4;
|
---|
749 | row_length = 1;
|
---|
750 | } else if(width <= 32)
|
---|
751 | {
|
---|
752 | row_padding = 32;
|
---|
753 | row_group = 2;
|
---|
754 | row_length = 1;
|
---|
755 | } else
|
---|
756 | {
|
---|
757 | row_padding = 64;
|
---|
758 | row_group = 1;
|
---|
759 | row_length = (width + 63) / 64;
|
---|
760 | }
|
---|
761 |
|
---|
762 | for(oc = 0; oc < channels; oc++)
|
---|
763 | {
|
---|
764 | uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
|
---|
765 | for(y = 0; y < height; y++)
|
---|
766 | {
|
---|
767 | uint8_t *y_origin = channel_origin + y * row_length * 64;
|
---|
768 | for(x = 0; x < width; x++)
|
---|
769 | y_origin[x] = *src++;
|
---|
770 | }
|
---|
771 | }
|
---|
772 | }
|
---|
#if USE_CACHED_AI_RAM
/**
 * Mirrors `lines` 64-byte lines (8 x uint64 each) starting at line `addr`
 * from the cached AI RAM alias into the uncached AI I/O window, so the KPU
 * observes data written through the cache.
 */
static void kpu_flush_cache(uint32_t addr, size_t lines)
{
    size_t line;

    for(line = 0; line < lines; line++)
    {
        const uint64_t *from = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
        uint64_t *to = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
        size_t w;

        for(w = 0; w < 8; w++)
            to[w] = from[w];
    }
}
#endif
/**
 * Arithmetic right shift by `shift` with a carry on the last bit shifted
 * out: if that bit is set, the result is nudged one further away from the
 * truncated value (+1 for non-negative, -1 for negative), matching the
 * KPU's output-shift behaviour. shift == 0 returns value unchanged.
 */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if(shift == 0)
        return value;

    value >>= shift - 1;
    if((value & 0x1) == 0)
        return value >> 1;

    return (value < 0) ? (value >> 1) - 1 : (value >> 1) + 1;
}
/*
 * Copies a channel-major uint8 tensor into the KPU I/O window at 64-byte
 * line `kpu_addr`, padding rows to the KPU line layout (same geometry rules
 * as kpu_input_with_padding). When the source is 8-byte aligned and the
 * width is a multiple of 8, rows are copied 64 bits at a time, with fully
 * unrolled bodies for the common 8/16/32-pixel row widths.
 */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Row geometry: bytes per channel slot within a line, channels per
     * 64-byte line, and lines per image row. */
    if(width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    } else if(width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    } else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }

    /* Fast path: 64-bit copies require both alignment and a width that is a
     * whole number of uint64s. */
    if((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* Opens the per-channel / per-row loops and computes the 64-bit-aligned row
 * base; the matching UPLOAD_END() closes both loops. */
#define UPLOAD_BEGIN() \
    for(oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for(y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
        } \
    }

        /* From here on, width counts uint64 words per row, not bytes. */
        width /= 8;
        const uint64_t *u64_src = (const uint64_t *)src;
        if(width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        } else if(width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        } else if(width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        } else
        {
            /* General 64-bit row copy for other multiple-of-8 widths. */
            UPLOAD_BEGIN()
            for(x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    } else
    {
        /* Slow path: byte-by-byte copy for unaligned or odd-width input. */
        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
890 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
891 | {
|
---|
892 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
893 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
894 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
895 |
|
---|
896 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
897 | }
|
---|
898 |
|
---|
/* Copies a float input tensor of `count` elements into the working buffer. */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, sizeof(*dest) * count);
}
903 |
|
---|
904 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
905 | {
|
---|
906 | size_t i;
|
---|
907 |
|
---|
908 | if(act == KLA_RELU)
|
---|
909 | {
|
---|
910 | for(i = 0; i < count; i++)
|
---|
911 | data[i] = max(data[i], 0);
|
---|
912 | } else if(act == KLA_RELU6)
|
---|
913 | {
|
---|
914 | for(i = 0; i < count; i++)
|
---|
915 | data[i] = min(max(data[i], 0), 6);
|
---|
916 | }
|
---|
917 | }
|
---|
918 |
|
---|
919 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
920 | {
|
---|
921 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
922 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
923 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
924 | size_t i, count = arg->count;
|
---|
925 |
|
---|
926 | for(i = 0; i < count; i++)
|
---|
927 | dest[i] = src_a[i] + src_b[i];
|
---|
928 | }
|
---|
929 |
|
---|
/*
 * Quantized element-wise addition. Each input is offset, multiplied and
 * right-shifted into a common scale, summed, then rescaled (mul, carry
 * shift, offset) and clamped to [0, 255]. The element count is rounded up
 * to a multiple of 8 and the body is macro-unrolled 8 wide, with the ten
 * QADD_UNROLL_* stages interleaved across all 8 lanes. Two macro sets are
 * used: when both input shifts are equal (sh_a == sh_b) the shift is
 * applied once after the add; otherwise each input is shifted separately
 * before the add.
 */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    /* count is in groups of 8 elements (input rounded up to a multiple of 8). */
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if(sh_a == sh_b)
    {
/* Stage 1: load one byte from each input into 64-bit lanes. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

/* Stage 2: apply the input zero-point offsets. */
#define QADD_UNROLL_2(x) \
    a##x += off_a;       \
    b##x += off_b;

/* Stage 3: bring both inputs to the common scale. */
#define QADD_UNROLL_3(x) \
    a##x *= mul_a;       \
    b##x *= mul_b;

/* Stage 4: sum. */
#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

/* Stage 5: shared input shift (valid because sh_a == sh_b). */
#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

/* Stage 6: output multiplier. */
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

/* Stage 7: output shift with round-carry (see kpu_carry_shift). */
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

/* Stage 8: output zero-point offset. */
#define QADD_UNROLL_8(x) \
    v##x += off_o;

/* Stage 9: clamp to uint8 range. */
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

/* Stage 10: store. */
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Applies one stage to all 8 lanes. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0)   \
    QADD_UNROLL_##x(1)   \
    QADD_UNROLL_##x(2)   \
    QADD_UNROLL_##x(3)   \
    QADD_UNROLL_##x(4)   \
    QADD_UNROLL_##x(5)   \
    QADD_UNROLL_##x(6)   \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    } else
    {
/* Same pipeline, but stage 4 shifts each input by its own amount before the
 * add in stage 5; later stages are renumbered accordingly. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a;       \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a;       \
    b##x *= mul_b;

/* Per-input shifts (sh_a != sh_b). */
#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a;       \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0)   \
    QADD_UNROLL_##x(1)   \
    QADD_UNROLL_##x(2)   \
    QADD_UNROLL_##x(3)   \
    QADD_UNROLL_##x(4)   \
    QADD_UNROLL_##x(5)   \
    QADD_UNROLL_##x(6)   \
    QADD_UNROLL_##x(7)

        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
1072 |
|
---|
1073 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1074 | {
|
---|
1075 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1076 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1077 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
1078 |
|
---|
1079 | for(oc = 0; oc < channels; oc++)
|
---|
1080 | {
|
---|
1081 | float sum = 0.f;
|
---|
1082 | size_t i;
|
---|
1083 | for(i = 0; i < kernel_size; i++)
|
---|
1084 | sum += *src++;
|
---|
1085 |
|
---|
1086 | dest[oc] = sum / kernel_size;
|
---|
1087 | }
|
---|
1088 | }
|
---|
1089 |
|
---|
1090 | static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1091 | {
|
---|
1092 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1093 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1094 | kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
|
---|
1095 | uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
|
---|
1096 | uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
|
---|
1097 | uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
|
---|
1098 |
|
---|
1099 | uint32_t out_y, out_x, oc;
|
---|
1100 |
|
---|
1101 | for(oc = 0; oc < out_shape.channels; oc++)
|
---|
1102 | {
|
---|
1103 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
1104 | for(out_y = 0; out_y < out_shape.height; out_y++)
|
---|
1105 | {
|
---|
1106 | for(out_x = 0; out_x < out_shape.width; out_x++)
|
---|
1107 | {
|
---|
1108 | int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
|
---|
1109 | int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
|
---|
1110 | int32_t kernel_x_start = max(0, -in_x_origin);
|
---|
1111 | int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
|
---|
1112 | int32_t kernel_y_start = max(0, -in_y_origin);
|
---|
1113 | int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
|
---|
1114 | uint8_t value = 0;
|
---|
1115 |
|
---|
1116 | int32_t kernel_y, kernel_x;
|
---|
1117 | for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
|
---|
1118 | {
|
---|
1119 | for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
|
---|
1120 | {
|
---|
1121 | int32_t in_x = in_x_origin + kernel_x;
|
---|
1122 | int32_t in_y = in_y_origin + kernel_y;
|
---|
1123 | value = max(value, channel_src[in_y * in_shape.width + in_x]);
|
---|
1124 | }
|
---|
1125 | }
|
---|
1126 |
|
---|
1127 | *dest++ = value;
|
---|
1128 | }
|
---|
1129 | }
|
---|
1130 | }
|
---|
1131 | }
|
---|
1132 |
|
---|
1133 | static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1134 | {
|
---|
1135 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1136 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1137 | kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
|
---|
1138 | uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
|
---|
1139 | uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
|
---|
1140 | uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
|
---|
1141 |
|
---|
1142 | uint32_t out_y, out_x, oc;
|
---|
1143 |
|
---|
1144 | for(oc = 0; oc < out_shape.channels; oc++)
|
---|
1145 | {
|
---|
1146 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
1147 | for(out_y = 0; out_y < out_shape.height; out_y++)
|
---|
1148 | {
|
---|
1149 | for(out_x = 0; out_x < out_shape.width; out_x++)
|
---|
1150 | {
|
---|
1151 | int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
|
---|
1152 | int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
|
---|
1153 | int32_t kernel_x_start = max(0, -in_x_origin);
|
---|
1154 | int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
|
---|
1155 | int32_t kernel_y_start = max(0, -in_y_origin);
|
---|
1156 | int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
|
---|
1157 | float value = 0;
|
---|
1158 | float kernel_count = 0;
|
---|
1159 |
|
---|
1160 | int32_t kernel_y, kernel_x;
|
---|
1161 | for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
|
---|
1162 | {
|
---|
1163 | for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
|
---|
1164 | {
|
---|
1165 | int32_t in_x = in_x_origin + kernel_x;
|
---|
1166 | int32_t in_y = in_y_origin + kernel_y;
|
---|
1167 | value += channel_src[in_y * in_shape.width + in_x];
|
---|
1168 | kernel_count++;
|
---|
1169 | }
|
---|
1170 | }
|
---|
1171 |
|
---|
1172 | *dest++ = value / kernel_count;
|
---|
1173 | }
|
---|
1174 | }
|
---|
1175 | }
|
---|
1176 | }
|
---|
1177 |
|
---|
1178 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1179 | {
|
---|
1180 | size_t count = arg->count;
|
---|
1181 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1182 |
|
---|
1183 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
1184 |
|
---|
1185 | float scale = 1.f / q.scale;
|
---|
1186 |
|
---|
1187 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
1188 | size_t i;
|
---|
1189 | for(i = 0; i < count; i++)
|
---|
1190 | {
|
---|
1191 | int value = roundf((*src++ - q.bias) * scale);
|
---|
1192 | if(value < 0)
|
---|
1193 | value = 0;
|
---|
1194 | if(value > 0xFF)
|
---|
1195 | value = 0xFF;
|
---|
1196 | *dest++ = (uint8_t)value;
|
---|
1197 | }
|
---|
1198 | }
|
---|
1199 |
|
---|
1200 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1201 | {
|
---|
1202 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1203 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1204 | size_t oc, count = arg->count;
|
---|
1205 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
1206 |
|
---|
1207 | for(oc = 0; oc < count; oc++)
|
---|
1208 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
1209 | }
|
---|
1210 |
|
---|
1211 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1212 | {
|
---|
1213 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1214 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1215 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
1216 |
|
---|
1217 | for(oc = 0; oc < channels; oc++)
|
---|
1218 | {
|
---|
1219 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
1220 |
|
---|
1221 | for(i = 0; i < count; i++)
|
---|
1222 | *dest++ = *src++ * q.scale + q.bias;
|
---|
1223 | }
|
---|
1224 | }
|
---|
1225 |
|
---|
1226 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1227 | {
|
---|
1228 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1229 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1230 | size_t oc, count = arg->count;
|
---|
1231 | const uint8_t *table = arg->table;
|
---|
1232 |
|
---|
1233 | if(false && count % 8 == 0)
|
---|
1234 | {
|
---|
1235 | for(oc = 0; oc < count;)
|
---|
1236 | {
|
---|
1237 | dest[oc++] = table[*src++];
|
---|
1238 | dest[oc++] = table[*src++];
|
---|
1239 | dest[oc++] = table[*src++];
|
---|
1240 | dest[oc++] = table[*src++];
|
---|
1241 | dest[oc++] = table[*src++];
|
---|
1242 | dest[oc++] = table[*src++];
|
---|
1243 | dest[oc++] = table[*src++];
|
---|
1244 | dest[oc++] = table[*src++];
|
---|
1245 | }
|
---|
1246 | } else
|
---|
1247 | {
|
---|
1248 | for(oc = 0; oc < count; oc++)
|
---|
1249 | dest[oc] = table[src[oc]];
|
---|
1250 | }
|
---|
1251 | }
|
---|
1252 |
|
---|
1253 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1254 | {
|
---|
1255 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1256 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1257 | size_t oc, channels = arg->channels;
|
---|
1258 |
|
---|
1259 | float sum = 0.f;
|
---|
1260 | const float epsilon = 1e-10f;
|
---|
1261 | for(oc = 0; oc < channels; oc++)
|
---|
1262 | sum += src[oc] * src[oc];
|
---|
1263 | if(sum < epsilon)
|
---|
1264 | sum = epsilon;
|
---|
1265 | sum = 1.f / sqrtf(sum);
|
---|
1266 | for(oc = 0; oc < channels; oc++)
|
---|
1267 | dest[oc] = src[oc] * sum;
|
---|
1268 | }
|
---|
1269 |
|
---|
1270 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1271 | {
|
---|
1272 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1273 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1274 | size_t oc, channels = arg->channels;
|
---|
1275 |
|
---|
1276 | float max = FLT_MIN;
|
---|
1277 | for(oc = 0; oc < channels; oc++)
|
---|
1278 | max = fmaxf(max, src[oc]);
|
---|
1279 |
|
---|
1280 | float sum = 0.f;
|
---|
1281 | for(oc = 0; oc < channels; oc++)
|
---|
1282 | {
|
---|
1283 | float value = expf(src[oc] - max);
|
---|
1284 | sum += value;
|
---|
1285 | dest[oc] = value;
|
---|
1286 | }
|
---|
1287 |
|
---|
1288 | for(oc = 0; oc < channels; oc++)
|
---|
1289 | dest[oc] /= sum;
|
---|
1290 | }
|
---|
1291 |
|
---|
1292 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1293 | {
|
---|
1294 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1295 | uint32_t count = arg->input_count, i;
|
---|
1296 |
|
---|
1297 | for(i = 0; i < count; i++)
|
---|
1298 | {
|
---|
1299 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
1300 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
1301 | memcpy(dest, src, input.size);
|
---|
1302 | dest += input.size;
|
---|
1303 | }
|
---|
1304 | }
|
---|
1305 |
|
---|
1306 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1307 | {
|
---|
1308 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1309 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1310 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
1311 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
1312 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
1313 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
1314 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
1315 |
|
---|
1316 | if(in_channels % 8 == 0)
|
---|
1317 | {
|
---|
1318 | #define FC_UNROLL_1(x) \
|
---|
1319 | float i##x = *c_src++; \
|
---|
1320 | float w##x = *c_weights++;
|
---|
1321 |
|
---|
1322 | #define FC_UNROLL_2(x) \
|
---|
1323 | sum += i##x * w##x;
|
---|
1324 |
|
---|
1325 | #define FC_UNROLL_S(x) \
|
---|
1326 | FC_UNROLL_##x(0) \
|
---|
1327 | FC_UNROLL_##x(1) \
|
---|
1328 | FC_UNROLL_##x(2) \
|
---|
1329 | FC_UNROLL_##x(3) \
|
---|
1330 | FC_UNROLL_##x(4) \
|
---|
1331 | FC_UNROLL_##x(5) \
|
---|
1332 | FC_UNROLL_##x(6) \
|
---|
1333 | FC_UNROLL_##x(7)
|
---|
1334 |
|
---|
1335 | for(oc = 0; oc < out_channels; oc++)
|
---|
1336 | {
|
---|
1337 | const float *c_src = src;
|
---|
1338 | const float *c_weights = weights + oc * in_channels;
|
---|
1339 |
|
---|
1340 | float sum = 0.0f;
|
---|
1341 | for(ic = 0; ic < in_channels / 8; ic++)
|
---|
1342 | {
|
---|
1343 | FC_UNROLL_S(1);
|
---|
1344 | FC_UNROLL_S(2);
|
---|
1345 | }
|
---|
1346 |
|
---|
1347 | dest[oc] = sum + bias[oc];
|
---|
1348 | }
|
---|
1349 | } else
|
---|
1350 | {
|
---|
1351 | for(oc = 0; oc < out_channels; oc++)
|
---|
1352 | {
|
---|
1353 | const float *c_weights = weights + oc * in_channels;
|
---|
1354 |
|
---|
1355 | float sum = 0.0f;
|
---|
1356 | for(ic = 0; ic < in_channels; ic++)
|
---|
1357 | sum += src[ic] * c_weights[ic];
|
---|
1358 | dest[oc] = sum + bias[oc];
|
---|
1359 | }
|
---|
1360 | }
|
---|
1361 | free(weights);
|
---|
1362 | free(bias);
|
---|
1363 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
1364 | }
|
---|
1365 |
|
---|
1366 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1367 | {
|
---|
1368 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1369 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1370 | kpu_model_shape_t in_shape = arg->shape;
|
---|
1371 | uint32_t oc, oy, ox;
|
---|
1372 |
|
---|
1373 | for(oy = 0; oy < in_shape.height; oy++)
|
---|
1374 | for(ox = 0; ox < in_shape.width; ox++)
|
---|
1375 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
1376 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
1377 | }
|
---|
1378 |
|
---|
1379 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1380 | {
|
---|
1381 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1382 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1383 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
1384 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
1385 | uint32_t oc, oy, ox;
|
---|
1386 |
|
---|
1387 | float height_scale = (float)in_shape.height / out_height;
|
---|
1388 | float width_scale = (float)in_shape.width / out_width;
|
---|
1389 |
|
---|
1390 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
1391 | {
|
---|
1392 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
1393 | for(oy = 0; oy < out_height; oy++)
|
---|
1394 | {
|
---|
1395 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
1396 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
1397 | for(ox = 0; ox < out_width; ox++)
|
---|
1398 | {
|
---|
1399 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
1400 | *dest++ = y_origin[in_x];
|
---|
1401 | }
|
---|
1402 | }
|
---|
1403 | }
|
---|
1404 | }
|
---|
1405 |
|
---|
1406 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1407 | {
|
---|
1408 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1409 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1410 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
1411 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
1412 | uint32_t oc, oy, ox;
|
---|
1413 |
|
---|
1414 | float height_scale = (float)in_shape.height / out_height;
|
---|
1415 | float width_scale = (float)in_shape.width / out_width;
|
---|
1416 |
|
---|
1417 | for(oc = 0; oc < in_shape.channels; oc++)
|
---|
1418 | {
|
---|
1419 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
1420 | for(oy = 0; oy < out_height; oy++)
|
---|
1421 | {
|
---|
1422 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
1423 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
1424 | for(ox = 0; ox < out_width; ox++)
|
---|
1425 | {
|
---|
1426 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
1427 | *dest++ = y_origin[in_x];
|
---|
1428 | }
|
---|
1429 | }
|
---|
1430 | }
|
---|
1431 | }
|
---|
1432 |
|
---|
1433 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1434 | {
|
---|
1435 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1436 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1437 | size_t oc, channels = arg->channels;
|
---|
1438 |
|
---|
1439 | for(oc = 0; oc < channels; oc++)
|
---|
1440 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
1441 | }
|
---|
1442 |
|
---|
/*
 * Runs one hardware convolution layer. The layer descriptor is copied out
 * of the model blob, patched with the runtime addresses of its weights,
 * batch-norm and activation tables, then submitted to the KPU. When the
 * layer's result goes back to main memory (KLF_MAIN_MEM_OUT) a DMA channel
 * is armed to drain the KPU output FIFO; otherwise the layer raises the
 * calc-done interrupt and the result stays in KPU memory.
 */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    /* Local volatile copy so the in-blob descriptor is never modified. */
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    /* Patch blob-relative offsets into bus addresses (IOMEM subtracted —
     * presumably converts a CPU alias to the KPU-visible address; confirm
     * against the memory map). */
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if(arg->flags & KLF_MAIN_MEM_OUT)
    {
        /* Result is DMA'd from the KPU FIFO into the main working buffer. */
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* Mask all KPU interrupts: completion is signalled by the DMA IRQ. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        /* Middle layers continue the pipeline via ai_step; the final layer
         * completes the model via kpu_kmodel_done. */
        if(ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    } else
    {
        /* Result stays in KPU memory: unmask calc_done and let the layer
         * raise its own interrupt. */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* NOTE: "enabe" is the field's actual (misspelled) name in the
         * register definition. */
        layer.interrupt_enabe.data.int_en = 1;
    }

    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
1486 |
|
---|
/*
 * Spreads a packed per-channel byte stream into the KPU's padded line
 * layout (row_padding=16 / row_group=4 geometry, i.e. the width<=16 case
 * of kpu_input_with_padding). The y and x loops are bounded at 1, so only
 * a single byte is written per channel — presumably this layer always
 * feeds a 1x1 feature map (e.g. after global pooling); confirm against the
 * model generator before changing.
 */
static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
#if USE_CACHED_AI_RAM
    /* Write through the cached alias; flushed to the I/O window below. */
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
#else
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
#endif

    /* Fixed narrow-image geometry: 4 channels share each 64-byte line. */
    uint32_t row_padding = 16;
    uint32_t row_group = 4;
    uint32_t row_length = 1;
    uint32_t height = 4;
    uint32_t oc, x, y, channels = arg->channels;

    for(oc = 0; oc < channels; oc++)
    {
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        for(y = 0; y < 1; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for(x = 0; x < 1; x++)
                y_origin[x] = *src++;
        }
    }

#if USE_CACHED_AI_RAM
    /* Push the freshly-written lines from cached RAM to the KPU window. */
    uint32_t lines = row_length * height * channels / row_group;
    kpu_flush_cache(arg->kpu_mem_out_address, lines);
#endif
}
1518 |
|
---|
1519 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1520 | {
|
---|
1521 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1522 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1523 | uint32_t oc, channels = arg->channels;
|
---|
1524 |
|
---|
1525 | for(oc = 0; oc < channels; oc++)
|
---|
1526 | *dest++ = src[oc * 16];
|
---|
1527 | }
|
---|
1528 |
|
---|
1529 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1530 | {
|
---|
1531 | size_t width = arg->width;
|
---|
1532 | size_t height = arg->height;
|
---|
1533 | size_t channels = arg->channels;
|
---|
1534 |
|
---|
1535 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
1536 | }
|
---|
1537 |
|
---|
1538 | int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
---|
1539 | {
|
---|
1540 | #if FIX_CACHE
|
---|
1541 | configASSERT(is_memory_cache((uintptr_t)buffer));
|
---|
1542 | #endif
|
---|
1543 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
1544 | const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
---|
1545 |
|
---|
1546 | if (header->version == 3 && header->arch == 0)
|
---|
1547 | {
|
---|
1548 | ctx->is_nncase = 0;
|
---|
1549 | ctx->model_buffer = buffer;
|
---|
1550 | ctx->output_count = header->output_count;
|
---|
1551 | ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
---|
1552 | ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
|
---|
1553 | ctx->layers_length = header->layers_length;
|
---|
1554 | ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
|
---|
1555 | ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
---|
1556 | if (!ctx->main_buffer)
|
---|
1557 | return -1;
|
---|
1558 | uint32_t body_size = 0;
|
---|
1559 | for (int i=0; i<ctx->layers_length; i++)
|
---|
1560 | {
|
---|
1561 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
---|
1562 | body_size += cnt_layer_header->body_size;
|
---|
1563 | }
|
---|
1564 | uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
---|
1565 | const uint8_t *body_start_cache = ctx->body_start;
|
---|
1566 | memcpy(body_start_iomem, body_start_cache, body_size);
|
---|
1567 | for (int i=0; i<body_size; i++)
|
---|
1568 | {
|
---|
1569 | configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
---|
1570 | }
|
---|
1571 |
|
---|
1572 | } else
|
---|
1573 | {
|
---|
1574 | return -1;
|
---|
1575 | }
|
---|
1576 |
|
---|
1577 | return 0;
|
---|
1578 | }
|
---|
1579 |
|
---|
1580 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
1581 | {
|
---|
1582 | if(ctx->is_nncase)
|
---|
1583 | return -1;
|
---|
1584 |
|
---|
1585 | if(index >= ctx->output_count)
|
---|
1586 | return -1;
|
---|
1587 |
|
---|
1588 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
1589 | *data = ctx->main_buffer + output->address;
|
---|
1590 | *size = output->size;
|
---|
1591 | return 0;
|
---|
1592 | }
|
---|
1593 |
|
---|
1594 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
1595 | {
|
---|
1596 | if(ctx->is_nncase)
|
---|
1597 | return;
|
---|
1598 |
|
---|
1599 | free(ctx->main_buffer);
|
---|
1600 | ctx->main_buffer = NULL;
|
---|
1601 | }
|
---|
1602 |
|
---|
#if KPU_DEBUG
/* Profiling state for the per-layer timing log (single execution context). */
static uint64_t last_time;       /* timestamp when the previous layer started */
static uint64_t total_time;      /* accumulated wall time over all layers     */
static uint64_t kpu_time;        /* portion spent in KL_K210_CONV layers      */
static uint32_t last_layer_type; /* type of the layer timed by last_time      */

/* Map a layer-type id to a short printable name for the profiling log. */
static const char *str_layer_type(uint32_t type)
{
    struct layer_name
    {
        uint32_t type;
        const char *name;
    };
    static const struct layer_name names[] = {
        {KL_ADD, "Add"},
        {KL_QUANTIZED_ADD, "QuantAdd"},
        {KL_GLOBAL_AVERAGE_POOL2D, "GAP"},
        {KL_QUANTIZED_MAX_POOL2D, "QuantMaxPool2d"},
        {KL_AVERAGE_POOL2D, "AveragePool2d"},
        {KL_QUANTIZE, "Quantize"},
        {KL_DEQUANTIZE, "Dequantize"},
        {KL_REQUANTIZE, "Requantize"},
        {KL_L2_NORMALIZATION, "L2Norm"},
        {KL_SOFTMAX, "Softmax"},
        {KL_CONCAT, "Concat"},
        {KL_QUANTIZED_CONCAT, "QuantConcat"},
        {KL_FULLY_CONNECTED, "FullyConnected"},
        {KL_TENSORFLOW_FLATTEN, "TFFlatten"},
        {KL_RESIZE_NEAREST_NEIGHBOR, "ResizeNearestNeighbor"},
        {KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR, "QuantResizeNearestNeighbor"},
        {KL_CHANNELWISE_DEQUANTIZE, "ChannelwiseDequantize"},
        {KL_LOGISTIC, "Logistic"},
        {KL_K210_CONV, "K210Conv"},
        {KL_K210_ADD_PADDING, "K210AddPad"},
        {KL_K210_REMOVE_PADDING, "K210RemovePad"},
        {KL_K210_UPLOAD, "K210Upload"},
    };

    for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
    {
        if (names[i].type == type)
            return names[i].name;
    }
    return "Unknown";
}
#endif
|
---|
1662 |
|
---|
/*
 * Finish a kmodel run: quiesce the KPU interrupt sources, emit the final
 * timing report (when KPU_DEBUG), and invoke the user's completion
 * callback registered in kpu_run_kmodel().
 *
 * Always returns 0.
 */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge any pending KPU interrupts... */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* ...then mask all three sources so no further KPU IRQs fire. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    /* current_layer was already advanced past the last layer executed. */
    uint32_t cnt_layer_id = ctx->current_layer - 1;
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        /* Account the final layer's elapsed time before the summary. */
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    /* Summary: hardware (conv) time, CPU-layer time, and total. */
    syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
    syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
    syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
#endif
    /* Hand the result to the caller-provided completion callback. */
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
1692 |
|
---|
1693 | static int ai_step(void *userdata)
|
---|
1694 | {
|
---|
1695 | kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
|
---|
1696 |
|
---|
1697 | uint32_t cnt_layer_id = ctx->current_layer;
|
---|
1698 | const uint8_t *layer_body = ctx->current_body;
|
---|
1699 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
|
---|
1700 | if (cnt_layer_id >= ctx->layers_length) {
|
---|
1701 | //syslog(LOG_NOTICE, "overrun");
|
---|
1702 | kpu_kmodel_done(ctx);
|
---|
1703 | return -1;
|
---|
1704 | }
|
---|
1705 |
|
---|
1706 | ctx->current_layer++;
|
---|
1707 | ctx->current_body += cnt_layer_header->body_size;
|
---|
1708 |
|
---|
1709 | #if KPU_DEBUG
|
---|
1710 | uint64_t time = sysctl_get_time_us();
|
---|
1711 | if(last_time != 0)
|
---|
1712 | {
|
---|
1713 | uint64_t layer_time = time - last_time;
|
---|
1714 | syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
|
---|
1715 | total_time += layer_time;
|
---|
1716 | if(last_layer_type == KL_K210_CONV)
|
---|
1717 | kpu_time += layer_time;
|
---|
1718 | }
|
---|
1719 |
|
---|
1720 | last_layer_type = cnt_layer_header->type;
|
---|
1721 | last_time = sysctl_get_time_us();
|
---|
1722 | #endif
|
---|
1723 |
|
---|
1724 | switch(cnt_layer_header->type)
|
---|
1725 | {
|
---|
1726 | case KL_ADD:
|
---|
1727 | kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
|
---|
1728 | break;
|
---|
1729 | case KL_QUANTIZED_ADD:
|
---|
1730 | kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
|
---|
1731 | break;
|
---|
1732 | case KL_GLOBAL_AVERAGE_POOL2D:
|
---|
1733 | kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
|
---|
1734 | break;
|
---|
1735 | case KL_QUANTIZED_MAX_POOL2D:
|
---|
1736 | kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
|
---|
1737 | break;
|
---|
1738 | case KL_AVERAGE_POOL2D:
|
---|
1739 | kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
|
---|
1740 | break;
|
---|
1741 | case KL_QUANTIZE:
|
---|
1742 | kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
|
---|
1743 | break;
|
---|
1744 | case KL_DEQUANTIZE:
|
---|
1745 | kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
|
---|
1746 | break;
|
---|
1747 | case KL_REQUANTIZE:
|
---|
1748 | kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
|
---|
1749 | break;
|
---|
1750 | case KL_L2_NORMALIZATION:
|
---|
1751 | kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
|
---|
1752 | break;
|
---|
1753 | case KL_SOFTMAX:
|
---|
1754 | kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
|
---|
1755 | break;
|
---|
1756 | case KL_CONCAT:
|
---|
1757 | case KL_QUANTIZED_CONCAT:
|
---|
1758 | kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
|
---|
1759 | break;
|
---|
1760 | case KL_FULLY_CONNECTED:
|
---|
1761 | kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
|
---|
1762 | break;
|
---|
1763 | case KL_TENSORFLOW_FLATTEN:
|
---|
1764 | kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
|
---|
1765 | break;
|
---|
1766 | case KL_RESIZE_NEAREST_NEIGHBOR:
|
---|
1767 | kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
|
---|
1768 | break;
|
---|
1769 | case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
|
---|
1770 | kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
|
---|
1771 | break;
|
---|
1772 | case KL_CHANNELWISE_DEQUANTIZE:
|
---|
1773 | kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
|
---|
1774 | break;
|
---|
1775 | case KL_LOGISTIC:
|
---|
1776 | kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
|
---|
1777 | break;
|
---|
1778 | case KL_K210_CONV:
|
---|
1779 | kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
|
---|
1780 | return 0;
|
---|
1781 | case KL_K210_ADD_PADDING:
|
---|
1782 | kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
|
---|
1783 | break;
|
---|
1784 | case KL_K210_REMOVE_PADDING:
|
---|
1785 | kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
|
---|
1786 | break;
|
---|
1787 | case KL_K210_UPLOAD:
|
---|
1788 | kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
|
---|
1789 | break;
|
---|
1790 | default:
|
---|
1791 | assert(!"Layer is not supported.");
|
---|
1792 | kpu_kmodel_done(ctx);
|
---|
1793 | return -1;
|
---|
1794 | }
|
---|
1795 |
|
---|
1796 | if (ctx->current_layer < ctx->layers_length)
|
---|
1797 | ai_step(userdata);
|
---|
1798 | else
|
---|
1799 | kpu_kmodel_done(ctx);
|
---|
1800 | return 0;
|
---|
1801 | }
|
---|
1802 |
|
---|
/*
 * Run ai_step() from thread (non-interrupt) context.  External
 * interrupts are masked around the call so the KPU completion ISR
 * cannot re-enter ai_step() while it is already executing.
 */
static void ai_step_not_isr(void *ctx_opaque)
{
    sysctl_disable_irq();
    ai_step(ctx_opaque);
    sysctl_enable_irq();
}
|
---|
1809 |
|
---|
1810 | int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
|
---|
1811 | {
|
---|
1812 | if(ctx->is_nncase)
|
---|
1813 | return -1;
|
---|
1814 |
|
---|
1815 | ctx->dma_ch = dma_ch;
|
---|
1816 | ctx->done_callback = done_callback;
|
---|
1817 | ctx->userdata = userdata;
|
---|
1818 | ctx->current_layer = 0;
|
---|
1819 | ctx->current_body = ctx->body_start;
|
---|
1820 | #if KPU_DEBUG
|
---|
1821 | last_time = 0;
|
---|
1822 | total_time = 0;
|
---|
1823 | kpu_time = 0;
|
---|
1824 | #endif
|
---|
1825 |
|
---|
1826 | kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
|
---|
1827 | kpu->interrupt_clear.reg = 7;
|
---|
1828 | kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
|
---|
1829 | .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
|
---|
1830 | kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
|
---|
1831 | .eight_bit_mode = header->flags & 1};
|
---|
1832 | kpu->interrupt_mask.data = (kpu_config_interrupt_t){
|
---|
1833 | .calc_done_int = 1,
|
---|
1834 | .layer_cfg_almost_empty_int = 0,
|
---|
1835 | .layer_cfg_almost_full_int = 1};
|
---|
1836 |
|
---|
1837 | plic_set_priority(INTNO_AI, 1);
|
---|
1838 | plic_irq_register(INTNO_AI, ai_step, ctx);
|
---|
1839 | plic_irq_enable(INTNO_AI);
|
---|
1840 |
|
---|
1841 | const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
|
---|
1842 |
|
---|
1843 | switch(first_layer_header->type)
|
---|
1844 | {
|
---|
1845 | case KL_K210_CONV:
|
---|
1846 | {
|
---|
1847 | const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
|
---|
1848 | kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
|
---|
1849 |
|
---|
1850 | if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
|
---|
1851 | {
|
---|
1852 | kpu_kmodel_input_with_padding(&layer_arg, src);
|
---|
1853 | ai_step_not_isr(ctx);
|
---|
1854 | } else
|
---|
1855 | {
|
---|
1856 | kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
|
---|
1857 | }
|
---|
1858 | }
|
---|
1859 | break;
|
---|
1860 | case KL_FULLY_CONNECTED:
|
---|
1861 | {
|
---|
1862 | const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
|
---|
1863 | kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
|
---|
1864 | ai_step_not_isr(ctx);
|
---|
1865 | }
|
---|
1866 | break;
|
---|
1867 | default:
|
---|
1868 | return -1;
|
---|
1869 | }
|
---|
1870 |
|
---|
1871 | return 0;
|
---|
1872 | }
|
---|