1 | #include <assert.h>
|
---|
2 | #include <float.h>
|
---|
3 | #include <math.h>
|
---|
4 | #include <stdio.h>
|
---|
5 | #include <stdlib.h>
|
---|
6 | #include <string.h>
|
---|
7 | #include <stdint.h>
|
---|
8 | #include <kernel.h>
|
---|
9 | #include <t_syslog.h>
|
---|
10 | #include <t_stdlib.h>
|
---|
11 | #include <kernel_impl.h>
|
---|
12 | #include <target_syssvc.h>
|
---|
13 | #include "kendryte-k210.h"
|
---|
14 | #include "device.h"
|
---|
15 | #include "atomic.h"
|
---|
16 | #include "kpu.h"
|
---|
17 | #include "utils.h"
|
---|
18 | #include "kpu_main.h"
|
---|
19 | #include "kernel_cfg.h"
|
---|
20 |
|
---|
/* Read-modify-write: OR bits `b` into the memory-mapped register at `a`.
 * NOTE(review): `a` is evaluated twice — pass only side-effect-free
 * address expressions. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
22 |
|
---|
23 | uint64_t sysctl_get_time_us(void)
|
---|
24 | {
|
---|
25 | uint64_t v_cycle = read_cycle();
|
---|
26 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
27 | }
|
---|
28 |
|
---|
/*
 *  Return nonzero when `address` is DMA-reachable memory: the cached
 *  SRAM window (0x80000000, 6 MiB), the uncached SRAM alias
 *  (0x40000000, 8 MiB), or the KPU FIFO register at 0x50450040.
 */
static int is_memory(uintptr_t address)
{
    const uintptr_t cached_base = 0x80000000;
    const uintptr_t cached_len = 6 * 1024 * 1024;
    const uintptr_t uncached_base = 0x40000000;
    const uintptr_t uncached_len = 8 * 1024 * 1024;

    if (address >= cached_base && address < cached_base + cached_len)
        return 1;
    if (address >= uncached_base && address < uncached_base + uncached_len)
        return 1;
    return address == 0x50450040;
}
|
---|
38 |
|
---|
/*
 *  Return nonzero when `address` lies in the cached SRAM window
 *  (0x80000000, first 6 MiB).
 *
 *  Fix: the original defined MEM_CACHE_LEN with a function-local
 *  #define that silently leaked to file scope; use block-scoped
 *  constants instead.
 */
uint32_t is_memory_cache(uintptr_t address)
{
    const uintptr_t mem_cache_base = 0x80000000;
    const uintptr_t mem_cache_len = 6 * 1024 * 1024;

    return (address >= mem_cache_base) && (address < mem_cache_base + mem_cache_len);
}
|
---|
45 |
|
---|
46 | int plic_irq_enable(INTNO irq_number)
|
---|
47 | {
|
---|
48 | if (irq_number != INTNO_AI)
|
---|
49 | return -1;
|
---|
50 | ena_int(irq_number);
|
---|
51 | return 0;
|
---|
52 | }
|
---|
53 |
|
---|
54 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
55 | {
|
---|
56 | if (irq_number != INTNO_AI)
|
---|
57 | return -1;
|
---|
58 | set_ipriority(irq_number, priority);
|
---|
59 | return 0;
|
---|
60 | }
|
---|
61 |
|
---|
/* Callback invoked from ai_done_isr() when the KPU raises its interrupt. */
plic_irq_callback_t ai_done_callback;
/* Opaque user context handed to ai_done_callback. */
void *ai_done_ctx;
|
---|
64 |
|
---|
65 | void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
|
---|
66 | {
|
---|
67 | if (irq != INTNO_AI)
|
---|
68 | return;
|
---|
69 |
|
---|
70 | dis_int(INTNO_AI);
|
---|
71 |
|
---|
72 | ai_done_callback = callback;
|
---|
73 | ai_done_ctx = ctx;
|
---|
74 |
|
---|
75 | ena_int(INTNO_AI);
|
---|
76 | }
|
---|
77 |
|
---|
78 | void ai_done_isr(intptr_t exinf)
|
---|
79 | {
|
---|
80 | dis_int(INTNO_AI);
|
---|
81 | if (ai_done_callback != NULL)
|
---|
82 | {
|
---|
83 | ai_done_callback(ai_done_ctx);
|
---|
84 | }
|
---|
85 | ena_int(INTNO_AI);
|
---|
86 | }
|
---|
87 |
|
---|
/* Callback invoked from ai_dma_done_isr() when the AI DMA channel completes. */
plic_irq_callback_t ai_dma_done_callback;
/* Opaque user context handed to ai_dma_done_callback. */
void *ai_dma_done_ctx;
|
---|
90 |
|
---|
91 | void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
|
---|
92 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
93 | {
|
---|
94 | if (channel_num != AI_DMA_CH)
|
---|
95 | return;
|
---|
96 |
|
---|
97 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
98 |
|
---|
99 | dis_int(INTNO_DMAAI);
|
---|
100 |
|
---|
101 | ai_dma_done_callback = dmac_callback;
|
---|
102 | ai_dma_done_ctx = ctx;
|
---|
103 |
|
---|
104 | ena_int(INTNO_DMAAI);
|
---|
105 | }
|
---|
106 |
|
---|
107 | void ai_dma_done_isr(DMA_Handle_t *dma)
|
---|
108 | {
|
---|
109 | dis_int(INTNO_DMAAI);
|
---|
110 |
|
---|
111 | if (ai_dma_done_callback != NULL)
|
---|
112 | {
|
---|
113 | ai_dma_done_callback(ai_dma_done_ctx);
|
---|
114 | }
|
---|
115 |
|
---|
116 | ena_int(INTNO_DMAAI);
|
---|
117 | }
|
---|
118 |
|
---|
119 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
120 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
121 | {
|
---|
122 | if (channel_num != AI_DMA_CH)
|
---|
123 | return;
|
---|
124 |
|
---|
125 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
126 |
|
---|
127 | dis_int(INTNO_DMAAI);
|
---|
128 |
|
---|
129 | ai_dma_done_callback = dmac_callback;
|
---|
130 | ai_dma_done_ctx = ctx;
|
---|
131 |
|
---|
132 | ena_int(INTNO_DMAAI);
|
---|
133 | }
|
---|
134 |
|
---|
/* DMA handle for transfers on the AI DMA channel (configured in
 * dmac_set_single_mode). */
DMA_Handle_t g_ai_hdma;
|
---|
136 |
|
---|
/*
 *  Configure and start a single DMA transfer on the AI DMA channel.
 *
 *  Direction and handshake modes are derived from whether src/dest
 *  look like memory (see is_memory(): software handshake) or a
 *  peripheral (hardware handshake).  `block_size` is in units of
 *  `dmac_trans_width`-wide transfers.  Channels other than AI_DMA_CH
 *  are ignored.
 */
void dmac_set_single_mode(dmac_channel_number_t channel_num,
        const void *src, void *dest, uint8_t src_inc,
        uint8_t dest_inc,
        uint8_t dmac_burst_size,
        uint8_t dmac_trans_width,
        size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    /* Classify both endpoints to pick the flow-control mode. */
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    uint8_t flow_control;
    if (mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if (mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if (mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                             /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
    hdma->Init.SrcInc = src_inc;                                                     /* source increment mode */
    hdma->Init.DstInc = dest_inc;                                                    /* destination increment mode */
    hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}
|
---|
172 |
|
---|
/* Number of layer descriptors pushed per burst when feeding the KPU. */
#define LAYER_BURST_SIZE 12

/* Set nonzero to enable KPU debug instrumentation. */
#define KPU_DEBUG 0
/* Set nonzero to route AI RAM traffic through the cached alias
 * (requires kpu_flush_cache, compiled below under the same flag). */
#define USE_CACHED_AI_RAM 0

/* NOTE(review): function-like min/max evaluate both arguments twice —
 * do not pass expressions with side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
---|
/* Round x up to the next multiple of `align` (power of two).
 * Fix: the original left `x` and `align` unparenthesized, so argument
 * expressions with lower-precedence operators (e.g. `a & b`) were
 * grouped incorrectly inside the macro body. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
181 |
|
---|
/* Defined later in this file: advance/finish the running kmodel. */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);

/* Memory-mapped KPU configuration register block. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* Status flag shared with interrupt context (hence volatile). */
static volatile uint32_t kpu_status;
|
---|
187 |
|
---|
/*
 *  Push one layer descriptor into the KPU layer-argument FIFO.
 *
 *  Every store targets the same volatile FIFO register, so the
 *  compiler emits them in program order; this sequence is presumably
 *  the order the hardware pops the twelve words — do not reorder.
 *  ("enabe" is the field's actual (misspelled) name in the SDK struct.)
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
203 |
|
---|
204 | void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
|
---|
205 | {
|
---|
206 | uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
|
---|
207 | dmac_set_irq(dma_ch, callback, userdata, 1);
|
---|
208 | dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
|
---|
209 | DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
|
---|
210 | }
|
---|
211 |
|
---|
/* Submit one convolution layer to the hardware — a thin wrapper over
 * kpu_send_layer(). */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
216 |
|
---|
/*
 *  Run a convolution layer on the KPU: acknowledge any stale interrupt
 *  sources, program the interrupt mask, then push the layer descriptor.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    /* Clear all three interrupt sources before starting. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* NOTE(review): assuming mask bit 1 = suppressed, this leaves only
     * the layer-cfg-almost-empty interrupt enabled — confirm against
     * the K210 KPU register description. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
229 |
|
---|
/*
 *  Global average pool over quantized (uint8) data, requantized to the
 *  output scale.
 *
 *  src         : kernel_size contiguous samples per channel
 *  src_param   : input quantization (scale/bias)
 *  kernel_size : samples averaged per channel
 *  channels    : number of channels (assumed >= 0 — compared against size_t)
 *  dest        : output buffer; if it points into the 2 MiB KPU I/O RAM
 *                window the result is written in the KPU 64-byte-row
 *                layout for a 1x1 map, otherwise as a flat vector
 *  dest_param  : output quantization
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU I/O RAM packing constants for a 1x1 (width <= 16) image:
         * four channels share each 64-byte row, 16 bytes apart. */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for (oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < 1; y++) /* 1x1 output: single row/column */
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for (i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize the mean, requantize to the output
                     * scale, clamp to uint8 range. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if (value < 0)
                        value = 0;
                    if (value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    }
    else
    {
        /* Plain memory: one byte per channel. */
        for (oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for (i = 0; i < kernel_size; i++)
                sum += *src++;

            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if (value < 0)
                value = 0;
            if (value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
283 |
|
---|
284 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
285 | {
|
---|
286 | quantize_param_t q = *src_param;
|
---|
287 | size_t oc;
|
---|
288 |
|
---|
289 | for (oc = 0; oc < channels; oc++)
|
---|
290 | {
|
---|
291 | int64_t sum = 0;
|
---|
292 | size_t i;
|
---|
293 | for (i = 0; i < kernel_size; i++)
|
---|
294 | sum += *src++;
|
---|
295 |
|
---|
296 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
297 | dest[oc] = value;
|
---|
298 | }
|
---|
299 | }
|
---|
300 |
|
---|
#if USE_CACHED_AI_RAM
/*
 *  Copy `lines` 64-byte rows from the cached AI RAM alias to the
 *  uncached I/O alias so the KPU observes data written through the
 *  cache.  `addr` is in 64-byte units.  Compiled only when
 *  USE_CACHED_AI_RAM is enabled.
 */
static void kpu_flush_cache(uint32_t addr, size_t lines)
{
    size_t line;
    for (line = 0; line < lines; line++)
    {
        /* One 64-byte line = eight 64-bit words. */
        const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
        uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
        size_t i;
        for (i = 0; i < 8; i++)
            dest[i] = src[i];
    }
}
#endif
|
---|
/*
 *  Arithmetic right shift by `shift` bits with round-half-away-from-
 *  zero: the last bit shifted out acts as a carry that bumps the
 *  magnitude.  shift == 0 returns the value unchanged.
 */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if (shift == 0)
        return value;

    /* Shift all but the last bit so the carry sits in bit 0. */
    int64_t v = value >> (shift - 1);
    if ((v & 1) == 0)
        return v >> 1;
    /* Carry set: round away from zero. */
    return (v < 0) ? (v >> 1) - 1 : (v >> 1) + 1;
}
|
---|
/*
 *  Copy a width x height x channels uint8 image from CPU memory into
 *  KPU I/O RAM at `kpu_addr` (in 64-byte units), converting to the
 *  KPU row layout: each image row occupies row_length 64-byte rows,
 *  and narrow images pack `row_group` channels per 64-byte row at
 *  `row_padding`-byte offsets.
 *
 *  A fast path copies 64-bit words when both the source pointer and
 *  width are 8-byte aligned, with hand-unrolled variants for widths
 *  8/16/32 bytes; otherwise bytes are copied individually.
 */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Select the packing parameters from the image width. */
    if (width <= 16)
    {
        row_padding = 16;
        row_group = 4; /* four channels per 64-byte row */
        row_length = 1;
    }
    else if (width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    }
    else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64; /* 64-byte rows per image row */
    }

    if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* UPLOAD_BEGIN/END bracket a channel+row loop; the body between them
 * copies one image row starting at y_origin. */
#define UPLOAD_BEGIN() \
    for (oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for (y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
    } \
    }

        width /= 8; /* width now counted in 64-bit words */
        const uint64_t *u64_src = (const uint64_t *)src;
        if (width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        }
        else if (width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        }
        else if (width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        }
        else
        {
            UPLOAD_BEGIN()
            for (x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    }
    else
    {
        /* Unaligned fallback: byte-at-a-time copy, same layout. */
        for (oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
|
---|
425 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
426 | {
|
---|
427 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
428 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
429 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
430 |
|
---|
431 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
432 | }
|
---|
433 |
|
---|
/* Copy a float input tensor of `count` elements into the working buffer. */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, count * sizeof(*dest));
}
|
---|
438 |
|
---|
439 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
440 | {
|
---|
441 | size_t i;
|
---|
442 |
|
---|
443 | if (act == KLA_RELU)
|
---|
444 | {
|
---|
445 | for (i = 0; i < count; i++)
|
---|
446 | data[i] = max(data[i], 0);
|
---|
447 | }
|
---|
448 | else if (act == KLA_RELU6)
|
---|
449 | {
|
---|
450 | for (i = 0; i < count; i++)
|
---|
451 | data[i] = min(max(data[i], 0), 6);
|
---|
452 | }
|
---|
453 | }
|
---|
454 |
|
---|
455 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
456 | {
|
---|
457 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
458 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
459 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
460 | size_t i, count = arg->count;
|
---|
461 |
|
---|
462 | for (i = 0; i < count; i++)
|
---|
463 | dest[i] = src_a[i] + src_b[i];
|
---|
464 | }
|
---|
465 |
|
---|
/*
 *  Elementwise addition of two quantized (uint8) tensors with
 *  requantized output.  Per element:
 *      a' = (a + off_a) * mul_a;  b' = (b + off_b) * mul_b
 *      v  = (a' + b') >> sh        (sh applied once when sh_a == sh_b,
 *                                   otherwise per input before adding)
 *      out = clamp(carry_shift(v * mul_o, sh_o) + off_o, 0, 255)
 *
 *  Elements are processed 8 per iteration via the QADD_UNROLL_* macros
 *  (count = ALIGN_UP(arg->count, 8) / 8 iterations); each QADD_UNROLL_S
 *  stage expands across all 8 lanes before the next stage runs.  The
 *  macros are #undef'd and redefined for the sh_a != sh_b variant —
 *  keep each definition set adjacent to its loop.
 */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if (sh_a == sh_b)
    {
/* Variant 1: common input shift, applied once after the add. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#define QADD_UNROLL_8(x) \
    v##x += off_o;

#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Expand one pipeline stage across the 8 unrolled lanes. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
    else
    {
/* Variant 2: differing input shifts, applied per input before adding. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
|
---|
609 |
|
---|
610 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
611 | {
|
---|
612 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
613 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
614 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
615 |
|
---|
616 | for (oc = 0; oc < channels; oc++)
|
---|
617 | {
|
---|
618 | float sum = 0.f;
|
---|
619 | size_t i;
|
---|
620 | for (i = 0; i < kernel_size; i++)
|
---|
621 | sum += *src++;
|
---|
622 |
|
---|
623 | dest[oc] = sum / kernel_size;
|
---|
624 | }
|
---|
625 | }
|
---|
626 |
|
---|
/*
 *  2-D max pooling over quantized (uint8) data, channel-planar layout
 *  (one width x height plane per channel).  Padding is handled by
 *  clipping the kernel window to the input bounds; the window maximum
 *  starts at 0, which is valid because inputs are unsigned.
 */
static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the window in input coordinates; may be
                 * negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel window to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                uint8_t value = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }

                *dest++ = value;
            }
        }
    }
}
|
---|
669 |
|
---|
/*
 *  2-D average pooling over float data, channel-planar layout.
 *  Padding is handled by clipping the kernel window to the input
 *  bounds; the divisor is the number of in-bounds samples actually
 *  accumulated (count-excluding-padding semantics).
 */
static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Window origin in input coordinates (may be negative
                 * because of padding). */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel window to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                float kernel_count = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }

                *dest++ = value / kernel_count;
            }
        }
    }
}
|
---|
714 |
|
---|
715 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
716 | {
|
---|
717 | size_t count = arg->count;
|
---|
718 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
719 |
|
---|
720 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
721 |
|
---|
722 | float scale = 1.f / q.scale;
|
---|
723 |
|
---|
724 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
725 | size_t i;
|
---|
726 | for (i = 0; i < count; i++)
|
---|
727 | {
|
---|
728 | int value = roundf((*src++ - q.bias) * scale);
|
---|
729 | if (value < 0)
|
---|
730 | value = 0;
|
---|
731 | if (value > 0xFF)
|
---|
732 | value = 0xFF;
|
---|
733 | *dest++ = (uint8_t)value;
|
---|
734 | }
|
---|
735 | }
|
---|
736 |
|
---|
737 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
738 | {
|
---|
739 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
740 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
741 | size_t oc, count = arg->count;
|
---|
742 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
743 |
|
---|
744 | for (oc = 0; oc < count; oc++)
|
---|
745 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
746 | }
|
---|
747 |
|
---|
748 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
749 | {
|
---|
750 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
751 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
752 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
753 |
|
---|
754 | for (oc = 0; oc < channels; oc++)
|
---|
755 | {
|
---|
756 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
757 |
|
---|
758 | for (i = 0; i < count; i++)
|
---|
759 | *dest++ = *src++ * q.scale + q.bias;
|
---|
760 | }
|
---|
761 | }
|
---|
762 |
|
---|
763 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
764 | {
|
---|
765 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
766 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
767 | size_t oc, count = arg->count;
|
---|
768 | const uint8_t *table = arg->table;
|
---|
769 |
|
---|
770 | if (false && count % 8 == 0)
|
---|
771 | {
|
---|
772 | for (oc = 0; oc < count;)
|
---|
773 | {
|
---|
774 | dest[oc++] = table[*src++];
|
---|
775 | dest[oc++] = table[*src++];
|
---|
776 | dest[oc++] = table[*src++];
|
---|
777 | dest[oc++] = table[*src++];
|
---|
778 | dest[oc++] = table[*src++];
|
---|
779 | dest[oc++] = table[*src++];
|
---|
780 | dest[oc++] = table[*src++];
|
---|
781 | dest[oc++] = table[*src++];
|
---|
782 | }
|
---|
783 | }
|
---|
784 | else
|
---|
785 | {
|
---|
786 | for (oc = 0; oc < count; oc++)
|
---|
787 | dest[oc] = table[src[oc]];
|
---|
788 | }
|
---|
789 | }
|
---|
790 |
|
---|
791 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
792 | {
|
---|
793 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
794 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
795 | size_t oc, channels = arg->channels;
|
---|
796 |
|
---|
797 | float sum = 0.f;
|
---|
798 | const float epsilon = 1e-10f;
|
---|
799 | for (oc = 0; oc < channels; oc++)
|
---|
800 | sum += src[oc] * src[oc];
|
---|
801 | if (sum < epsilon)
|
---|
802 | sum = epsilon;
|
---|
803 | sum = 1.f / sqrtf(sum);
|
---|
804 | for (oc = 0; oc < channels; oc++)
|
---|
805 | dest[oc] = src[oc] * sum;
|
---|
806 | }
|
---|
807 |
|
---|
808 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
809 | {
|
---|
810 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
811 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
812 | size_t oc, channels = arg->channels;
|
---|
813 |
|
---|
814 | float max = FLT_MIN;
|
---|
815 | for (oc = 0; oc < channels; oc++)
|
---|
816 | max = fmaxf(max, src[oc]);
|
---|
817 |
|
---|
818 | float sum = 0.f;
|
---|
819 | for (oc = 0; oc < channels; oc++)
|
---|
820 | {
|
---|
821 | float value = expf(src[oc] - max);
|
---|
822 | sum += value;
|
---|
823 | dest[oc] = value;
|
---|
824 | }
|
---|
825 |
|
---|
826 | for (oc = 0; oc < channels; oc++)
|
---|
827 | dest[oc] /= sum;
|
---|
828 | }
|
---|
829 |
|
---|
830 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
831 | {
|
---|
832 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
833 | uint32_t count = arg->input_count, i;
|
---|
834 |
|
---|
835 | for (i = 0; i < count; i++)
|
---|
836 | {
|
---|
837 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
838 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
839 | memcpy(dest, src, input.size);
|
---|
840 | dest += input.size;
|
---|
841 | }
|
---|
842 | }
|
---|
843 |
|
---|
844 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
845 | {
|
---|
846 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
847 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
848 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
849 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
850 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
851 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
852 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
853 |
|
---|
854 | if (in_channels % 8 == 0)
|
---|
855 | {
|
---|
856 | #define FC_UNROLL_1(x) \
|
---|
857 | float i##x = *c_src++; \
|
---|
858 | float w##x = *c_weights++;
|
---|
859 |
|
---|
860 | #define FC_UNROLL_2(x) \
|
---|
861 | sum += i##x * w##x;
|
---|
862 |
|
---|
863 | #define FC_UNROLL_S(x) \
|
---|
864 | FC_UNROLL_##x(0) \
|
---|
865 | FC_UNROLL_##x(1) \
|
---|
866 | FC_UNROLL_##x(2) \
|
---|
867 | FC_UNROLL_##x(3) \
|
---|
868 | FC_UNROLL_##x(4) \
|
---|
869 | FC_UNROLL_##x(5) \
|
---|
870 | FC_UNROLL_##x(6) \
|
---|
871 | FC_UNROLL_##x(7)
|
---|
872 |
|
---|
873 | for (oc = 0; oc < out_channels; oc++)
|
---|
874 | {
|
---|
875 | const float *c_src = src;
|
---|
876 | const float *c_weights = weights + oc * in_channels;
|
---|
877 |
|
---|
878 | float sum = 0.0f;
|
---|
879 | for (ic = 0; ic < in_channels / 8; ic++)
|
---|
880 | {
|
---|
881 | FC_UNROLL_S(1);
|
---|
882 | FC_UNROLL_S(2);
|
---|
883 | }
|
---|
884 |
|
---|
885 | dest[oc] = sum + bias[oc];
|
---|
886 | }
|
---|
887 | }
|
---|
888 | else
|
---|
889 | {
|
---|
890 | for (oc = 0; oc < out_channels; oc++)
|
---|
891 | {
|
---|
892 | const float *c_weights = weights + oc * in_channels;
|
---|
893 |
|
---|
894 | float sum = 0.0f;
|
---|
895 | for (ic = 0; ic < in_channels; ic++)
|
---|
896 | sum += src[ic] * c_weights[ic];
|
---|
897 | dest[oc] = sum + bias[oc];
|
---|
898 | }
|
---|
899 | }
|
---|
900 | free(weights);
|
---|
901 | free(bias);
|
---|
902 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
903 | }
|
---|
904 |
|
---|
905 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
906 | {
|
---|
907 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
908 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
909 | kpu_model_shape_t in_shape = arg->shape;
|
---|
910 | uint32_t oc, oy, ox;
|
---|
911 |
|
---|
912 | for (oy = 0; oy < in_shape.height; oy++)
|
---|
913 | for (ox = 0; ox < in_shape.width; ox++)
|
---|
914 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
915 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
916 | }
|
---|
917 |
|
---|
918 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
919 | {
|
---|
920 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
921 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
922 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
923 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
924 | uint32_t oc, oy, ox;
|
---|
925 |
|
---|
926 | float height_scale = (float)in_shape.height / out_height;
|
---|
927 | float width_scale = (float)in_shape.width / out_width;
|
---|
928 |
|
---|
929 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
930 | {
|
---|
931 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
932 | for (oy = 0; oy < out_height; oy++)
|
---|
933 | {
|
---|
934 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
935 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
936 | for (ox = 0; ox < out_width; ox++)
|
---|
937 | {
|
---|
938 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
939 | *dest++ = y_origin[in_x];
|
---|
940 | }
|
---|
941 | }
|
---|
942 | }
|
---|
943 | }
|
---|
944 |
|
---|
945 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
946 | {
|
---|
947 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
948 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
949 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
950 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
951 | uint32_t oc, oy, ox;
|
---|
952 |
|
---|
953 | float height_scale = (float)in_shape.height / out_height;
|
---|
954 | float width_scale = (float)in_shape.width / out_width;
|
---|
955 |
|
---|
956 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
957 | {
|
---|
958 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
959 | for (oy = 0; oy < out_height; oy++)
|
---|
960 | {
|
---|
961 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
962 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
963 | for (ox = 0; ox < out_width; ox++)
|
---|
964 | {
|
---|
965 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
966 | *dest++ = y_origin[in_x];
|
---|
967 | }
|
---|
968 | }
|
---|
969 | }
|
---|
970 | }
|
---|
971 |
|
---|
972 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
973 | {
|
---|
974 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
975 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
976 | size_t oc, channels = arg->channels;
|
---|
977 |
|
---|
978 | for (oc = 0; oc < channels; oc++)
|
---|
979 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
980 | }
|
---|
981 |
|
---|
/*
 * Queue one K210 hardware convolution layer.
 *
 * The layer descriptor is copied out of the model buffer, then its weight /
 * batch-norm / activation table pointers are rebased by subtracting IOMEM
 * before being handed to the KPU.
 *
 * Two completion paths:
 *  - KLF_MAIN_MEM_OUT set: results are DMA'd from the KPU output FIFO into
 *    main memory; the DMA completion callback advances the pipeline
 *    (ai_step) or, for the last layer, finishes the run (kpu_kmodel_done).
 *    All three KPU interrupt sources are masked.
 *  - otherwise: output stays in KPU memory; the "calc done" interrupt is
 *    left unmasked and the layer's int_en flag is set so hardware completion
 *    re-enters the pipeline via the KPU interrupt.
 */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    /* Work on a private copy of the descriptor; the in-model original is not
     * modified. */
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if (arg->flags & KLF_MAIN_MEM_OUT)
    {
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        /* Clear any pending KPU interrupt flags... */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* ...and mask all of them: completion is signalled via DMA instead. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        /* current_layer was already advanced by ai_step: if more layers
         * remain, continue the pipeline from the DMA IRQ; else finish. */
        if (ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        /* 64-bit FIFO reads; length rounded up to whole 8-byte beats. */
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    }
    else
    {
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        /* Leave calc_done unmasked so the KPU interrupt drives ai_step. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.interrupt_enabe.data.int_en = 1;
    }

    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
|
---|
1026 |
|
---|
/*
 * Copy a flat channel vector from main memory into the KPU's padded RAM
 * layout: 64-byte rows, row_group channels packed per row at
 * row_padding-byte offsets.
 *
 * NOTE(review): row_padding/row_group/row_length/height are hard-coded and
 * the x/y loops each run exactly once, so this path only handles 1x1
 * feature maps (one byte per channel) — confirm against the model
 * compiler's use of KL_K210_ADD_PADDING.
 */
static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
#if USE_CACHED_AI_RAM
    /* Cached alias of KPU RAM; requires the explicit flush below. */
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
#else
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
#endif

    /* Fixed KPU RAM geometry for the 1x1 case. */
    uint32_t row_padding = 16;
    uint32_t row_group = 4;
    uint32_t row_length = 1;
    uint32_t height = 4;
    uint32_t oc, x, y, channels = arg->channels;

    for (oc = 0; oc < channels; oc++)
    {
        /* Each group of 4 channels shares a 64-byte row; within the row a
         * channel's data starts every 16 bytes. */
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        for (y = 0; y < 1; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for (x = 0; x < 1; x++)
                y_origin[x] = *src++;
        }
    }

#if USE_CACHED_AI_RAM
    /* Writes went through the cached alias; flush so the KPU observes them. */
    uint32_t lines = row_length * height * channels / row_group;
    kpu_flush_cache(arg->kpu_mem_out_address, lines);
#endif
}
|
---|
1058 |
|
---|
1059 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1060 | {
|
---|
1061 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1062 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1063 | uint32_t oc, channels = arg->channels;
|
---|
1064 |
|
---|
1065 | for (oc = 0; oc < channels; oc++)
|
---|
1066 | *dest++ = src[oc * 16];
|
---|
1067 | }
|
---|
1068 |
|
---|
1069 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1070 | {
|
---|
1071 | size_t width = arg->width;
|
---|
1072 | size_t height = arg->height;
|
---|
1073 | size_t channels = arg->channels;
|
---|
1074 |
|
---|
1075 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
1076 | }
|
---|
1077 |
|
---|
1078 | int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
---|
1079 | {
|
---|
1080 | #if FIX_CACHE
|
---|
1081 | configASSERT(is_memory_cache((uintptr_t)buffer));
|
---|
1082 | #endif
|
---|
1083 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
1084 | const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
---|
1085 |
|
---|
1086 | if (header->version == 3 && header->arch == 0)
|
---|
1087 | {
|
---|
1088 | ctx->model_buffer = buffer;
|
---|
1089 | ctx->output_count = header->output_count;
|
---|
1090 | ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
---|
1091 | ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
|
---|
1092 | ctx->layers_length = header->layers_length;
|
---|
1093 | ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
|
---|
1094 | ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
---|
1095 | if (!ctx->main_buffer)
|
---|
1096 | return -1;
|
---|
1097 | uint32_t body_size = 0;
|
---|
1098 | for (int i = 0; i < ctx->layers_length; i++)
|
---|
1099 | {
|
---|
1100 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
---|
1101 | body_size += cnt_layer_header->body_size;
|
---|
1102 | }
|
---|
1103 | uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
---|
1104 | const uint8_t *body_start_cache = ctx->body_start;
|
---|
1105 | memcpy(body_start_iomem, body_start_cache, body_size);
|
---|
1106 | for (int i = 0; i < body_size; i++)
|
---|
1107 | {
|
---|
1108 | configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
---|
1109 | }
|
---|
1110 | }
|
---|
1111 | else
|
---|
1112 | {
|
---|
1113 | return -1;
|
---|
1114 | }
|
---|
1115 |
|
---|
1116 | return 0;
|
---|
1117 | }
|
---|
1118 |
|
---|
1119 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
1120 | {
|
---|
1121 | if (index >= ctx->output_count)
|
---|
1122 | return -1;
|
---|
1123 |
|
---|
1124 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
1125 | *data = ctx->main_buffer + output->address;
|
---|
1126 | *size = output->size;
|
---|
1127 | return 0;
|
---|
1128 | }
|
---|
1129 |
|
---|
1130 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
1131 | {
|
---|
1132 | free(ctx->main_buffer);
|
---|
1133 | ctx->main_buffer = NULL;
|
---|
1134 | }
|
---|
1135 |
|
---|
#if KPU_DEBUG
/* Per-run profiling state, reset by kpu_run_kmodel and updated at layer
 * boundaries by ai_step / kpu_kmodel_done. */
static uint64_t last_time;       /* timestamp (us) of the previous layer boundary */
static uint64_t total_time;      /* accumulated wall time across all layers */
static uint64_t kpu_time;        /* accumulated time of KL_K210_CONV layers */
static uint32_t last_layer_type; /* type of the layer last_time refers to */

/* Map a KL_* layer type code to a short human-readable name for logging. */
static const char *str_layer_type(uint32_t type)
{
    switch (type)
    {
    case KL_ADD: return "Add";
    case KL_QUANTIZED_ADD: return "QuantAdd";
    case KL_GLOBAL_AVERAGE_POOL2D: return "GAP";
    case KL_QUANTIZED_MAX_POOL2D: return "QuantMaxPool2d";
    case KL_AVERAGE_POOL2D: return "AveragePool2d";
    case KL_QUANTIZE: return "Quantize";
    case KL_DEQUANTIZE: return "Dequantize";
    case KL_REQUANTIZE: return "Requantize";
    case KL_L2_NORMALIZATION: return "L2Norm";
    case KL_SOFTMAX: return "Softmax";
    case KL_CONCAT: return "Concat";
    case KL_QUANTIZED_CONCAT: return "QuantConcat";
    case KL_FULLY_CONNECTED: return "FullyConnected";
    case KL_TENSORFLOW_FLATTEN: return "TFFlatten";
    case KL_RESIZE_NEAREST_NEIGHBOR: return "ResizeNearestNeighbor";
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: return "QuantResizeNearestNeighbor";
    case KL_CHANNELWISE_DEQUANTIZE: return "ChannelwiseDequantize";
    case KL_LOGISTIC: return "Logistic";
    case KL_K210_CONV: return "K210Conv";
    case KL_K210_ADD_PADDING: return "K210AddPad";
    case KL_K210_REMOVE_PADDING: return "K210RemovePad";
    case KL_K210_UPLOAD: return "K210Upload";
    default: return "Unknown";
    }
}
#endif
|
---|
1195 |
|
---|
/*
 * Final step of a kmodel run: clear any pending KPU interrupt flags, mask
 * all three interrupt sources, emit the optional profiling summary, and
 * invoke the user's completion callback. Always returns 0 (it is also used
 * as a DMA completion callback via a cast).
 */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge anything pending... */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* ...then mask every KPU interrupt source. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    uint32_t cnt_layer_id = ctx->current_layer;
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        /* Charge the time since the previous boundary to the layer type
         * recorded there. */
        uint64_t layer_time = time - last_time;
        /* NOTE(review): %d is paired with uint64_t operands here and below;
         * this relies on the TOPPERS syslog varargs convention — confirm,
         * or cast the arguments explicitly. */
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
    syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
    syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
#endif
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
1225 |
|
---|
/*
 * Execute one layer of the loaded kmodel and advance the pipeline.
 *
 * This function is registered as the KPU PLIC handler and as a DMA
 * completion callback, and also recurses for layers that complete
 * synchronously on the CPU. KL_K210_CONV layers return immediately after
 * being queued to the hardware; their completion interrupt re-enters here.
 * Returns 0 on normal dispatch, -1 on overrun or unsupported layer.
 */
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer;
    const uint8_t *layer_body = ctx->current_body;
    /* Pointer is computed before the bounds check below, but only
     * dereferenced after the check passes. */
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    if (cnt_layer_id >= ctx->layers_length)
    {
        /* Spurious extra invocation (e.g. a late interrupt): finish up. */
        //syslog(LOG_NOTICE, "overrun");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* Advance the cursor to the next layer before dispatching this one. */
    ctx->current_layer++;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    /* Dispatch on layer type; each helper parses its own argument struct
     * from layer_body. */
    switch (cnt_layer_header->type)
    {
    case KL_ADD:
        kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_ADD:
        kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_GLOBAL_AVERAGE_POOL2D:
        kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_MAX_POOL2D:
        kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_AVERAGE_POOL2D:
        kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZE:
        kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_DEQUANTIZE:
        kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_REQUANTIZE:
        kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_L2_NORMALIZATION:
        kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
        break;
    case KL_SOFTMAX:
        kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CONCAT:
    case KL_QUANTIZED_CONCAT:
        /* Both concat variants are byte copies; one helper serves both. */
        kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
        break;
    case KL_FULLY_CONNECTED:
        kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
        break;
    case KL_TENSORFLOW_FLATTEN:
        kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
        break;
    case KL_RESIZE_NEAREST_NEIGHBOR:
        kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
        kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CHANNELWISE_DEQUANTIZE:
        kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
        break;
    case KL_LOGISTIC:
        kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_CONV:
        kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
        /* Hardware layer: continuation is driven by the KPU/DMA interrupt,
         * not by the recursion below. */
        return 0;
    case KL_K210_ADD_PADDING:
        kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_REMOVE_PADDING:
        kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_UPLOAD:
        kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
        break;
    default:
        assert(!"Layer is not supported.");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* NOTE(review): when current_layer == layers_length - 1 there is still
     * one unprocessed layer, yet kpu_kmodel_done() is taken. The upstream
     * Kendryte SDK compares against layers_length here — confirm whether
     * skipping the final layer is intentional in this port. */
    if (ctx->current_layer < (ctx->layers_length - 1))
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
|
---|
1336 |
|
---|
/*
 * Run ai_step() from task (non-ISR) context with the AI DMA and KPU
 * interrupts disabled, so the synchronous pipeline walk cannot race the
 * interrupt-driven one. Disable/enable ordering is deliberate; do not
 * reorder.
 */
static void ai_step_not_isr(void *userdata)
{
    dis_int(INTNO_DMAAI);
    dis_int(INTNO_AI);

    ai_step(userdata);

    ena_int(INTNO_DMAAI);
    ena_int(INTNO_AI);
}
|
---|
1347 |
|
---|
/*
 * Start asynchronous execution of a previously loaded kmodel.
 *
 * src points to the input tensor, dma_ch is the DMA channel used for
 * KPU<->memory transfers, and done_callback(userdata) fires when the run
 * completes. Returns 0 on success, -1 if the first layer is not one of the
 * types that can accept external input (K210 conv or fully-connected).
 */
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    /* Reset the per-run profiling counters. */
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    /* Clear all pending interrupt flags (bits 0..2), set FIFO thresholds,
     * select 8-bit mode from kmodel header flag bit 0, and unmask only the
     * "layer config almost empty" interrupt. */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    /* Route the KPU interrupt to the pipeline stepper. */
    //plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, ai_step, ctx);
    plic_irq_enable(INTNO_AI);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;

    /* Feed the input according to what the first layer expects. */
    switch (first_layer_header->type)
    {
    case KL_K210_CONV:
    {
        const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
        kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

        /* If (row width + 1) is not a multiple of 64, the input needs
         * CPU-side padding; otherwise it can be DMA'd straight in. */
        if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
        {
            kpu_kmodel_input_with_padding(&layer_arg, src);
            ai_step_not_isr(ctx);
        }
        else
        {
            kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
        }
    }
    break;
    case KL_FULLY_CONNECTED:
    {
        const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
        kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
        ai_step_not_isr(ctx);
    }
    break;
    default:
        return -1;
    }

    return 0;
}
|
---|
1409 |
|
---|
/*
 * Configure the AI DMA handle (g_ai_hdma) used for KPU output transfers,
 * attach the model context as its local data, and initialize the channel.
 * Returns the TOPPERS error code from dma_init().
 */
ER kpu_init(kpu_model_context_t *ctx)
{
    g_ai_hdma.chnum = AI_DMA_CH;
    g_ai_hdma.xfercallback = ai_dma_done_isr;
    g_ai_hdma.errorcallback = NULL;
    g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;      /* DMA request select */
    g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;    /* DMA transfer direction */
    g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;  /* source multi-block type */
    g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;  /* destination multi-block type */
    g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;     /* source handshake */
    g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;     /* destination handshake */
    g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
    g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
    g_ai_hdma.Init.Priority = 4;                        /* channel priority */
    g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;            /* source master */
    g_ai_hdma.Init.DstMaster = DMAC_MASTER2;            /* destination master */
    g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;         /* source address increment */
    g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;        /* destination address increment */
    g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* source transfer width */
    g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* destination transfer width */
    g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;         /* source burst size */
    g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;         /* destination burst size */
    g_ai_hdma.Init.IocBlkTrans = 0;                     /* IOC block transfer */
    g_ai_hdma.localdata = (void *)ctx;

    return dma_init(&g_ai_hdma);
}
|
---|