1 | #include <assert.h>
|
---|
2 | #include <float.h>
|
---|
3 | #include <math.h>
|
---|
4 | #include <stdio.h>
|
---|
5 | #include <stdlib.h>
|
---|
6 | #include <string.h>
|
---|
7 | #include <stdint.h>
|
---|
8 | #include <kernel.h>
|
---|
9 | #include <t_syslog.h>
|
---|
10 | #include <t_stdlib.h>
|
---|
11 | #include <kernel_impl.h>
|
---|
12 | #include <target_syssvc.h>
|
---|
13 | #include "kendryte-k210.h"
|
---|
14 | #include "device.h"
|
---|
15 | #include "atomic.h"
|
---|
16 | #include "kpu.h"
|
---|
17 | #include "utils.h"
|
---|
18 | #include "kpu_main.h"
|
---|
19 | #include "kernel_cfg.h"
|
---|
20 |
|
---|
/* Read-modify-write: OR bits `b` into the memory-mapped register at `a`.
 * NOTE(review): `a` is evaluated twice — pass only side-effect-free
 * address expressions. */
#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
|
---|
22 |
|
---|
23 | uint64_t sysctl_get_time_us(void)
|
---|
24 | {
|
---|
25 | uint64_t v_cycle = read_cycle();
|
---|
26 | return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
|
---|
27 | }
|
---|
28 |
|
---|
/*
 *  Return nonzero when `address` is DMA-reachable memory: the cached
 *  SRAM window (0x80000000, 6 MiB), the uncached SRAM alias
 *  (0x40000000, 8 MiB), or the KPU FIFO register at 0x50450040.
 */
static int is_memory(uintptr_t address)
{
    const uintptr_t cached_base = 0x80000000;
    const uintptr_t cached_len = 6 * 1024 * 1024;
    const uintptr_t uncached_base = 0x40000000;
    const uintptr_t uncached_len = 8 * 1024 * 1024;

    if (address >= cached_base && address < cached_base + cached_len)
        return 1;
    if (address >= uncached_base && address < uncached_base + uncached_len)
        return 1;
    return address == 0x50450040;
}
|
---|
38 |
|
---|
/*
 *  Return nonzero when `address` lies in the cached SRAM window
 *  (0x80000000, first 6 MiB).
 *
 *  Fix: the original defined MEM_CACHE_LEN with a function-local
 *  #define that silently leaked to file scope; use block-scoped
 *  constants instead.
 */
uint32_t is_memory_cache(uintptr_t address)
{
    const uintptr_t mem_cache_base = 0x80000000;
    const uintptr_t mem_cache_len = 6 * 1024 * 1024;

    return (address >= mem_cache_base) && (address < mem_cache_base + mem_cache_len);
}
|
---|
45 |
|
---|
46 | int plic_irq_enable(INTNO irq_number)
|
---|
47 | {
|
---|
48 | if (irq_number != INTNO_AI)
|
---|
49 | return -1;
|
---|
50 | ena_int(irq_number);
|
---|
51 | return 0;
|
---|
52 | }
|
---|
53 |
|
---|
54 | int plic_set_priority(INTNO irq_number, uint32_t priority)
|
---|
55 | {
|
---|
56 | if (irq_number != INTNO_AI)
|
---|
57 | return -1;
|
---|
58 | set_ipriority(irq_number, priority);
|
---|
59 | return 0;
|
---|
60 | }
|
---|
61 |
|
---|
/* Callback invoked from ai_done_isr() when the KPU raises its interrupt. */
plic_irq_callback_t ai_done_callback;
/* Opaque user context handed to ai_done_callback. */
void *ai_done_ctx;
|
---|
64 |
|
---|
65 | void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
|
---|
66 | {
|
---|
67 | if (irq != INTNO_AI)
|
---|
68 | return;
|
---|
69 |
|
---|
70 | dis_int(INTNO_AI);
|
---|
71 |
|
---|
72 | ai_done_callback = callback;
|
---|
73 | ai_done_ctx = ctx;
|
---|
74 |
|
---|
75 | ena_int(INTNO_AI);
|
---|
76 | }
|
---|
77 |
|
---|
78 | void ai_done_isr(intptr_t exinf)
|
---|
79 | {
|
---|
80 | dis_int(INTNO_AI);
|
---|
81 | if (ai_done_callback != NULL)
|
---|
82 | {
|
---|
83 | ai_done_callback(ai_done_ctx);
|
---|
84 | }
|
---|
85 | ena_int(INTNO_AI);
|
---|
86 | }
|
---|
87 |
|
---|
/* Callback invoked from ai_dma_done_isr() when the AI DMA channel completes. */
plic_irq_callback_t ai_dma_done_callback;
/* Opaque user context handed to ai_dma_done_callback. */
void *ai_dma_done_ctx;
|
---|
90 |
|
---|
91 | void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
|
---|
92 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
93 | {
|
---|
94 | if (channel_num != AI_DMA_CH)
|
---|
95 | return;
|
---|
96 |
|
---|
97 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
98 |
|
---|
99 | dis_int(INTNO_DMAAI);
|
---|
100 |
|
---|
101 | ai_dma_done_callback = dmac_callback;
|
---|
102 | ai_dma_done_ctx = ctx;
|
---|
103 |
|
---|
104 | ena_int(INTNO_DMAAI);
|
---|
105 | }
|
---|
106 |
|
---|
107 | void ai_dma_done_isr(DMA_Handle_t *dma)
|
---|
108 | {
|
---|
109 | dis_int(INTNO_DMAAI);
|
---|
110 |
|
---|
111 | if (ai_dma_done_callback != NULL)
|
---|
112 | {
|
---|
113 | ai_dma_done_callback(ai_dma_done_ctx);
|
---|
114 | }
|
---|
115 |
|
---|
116 | ena_int(INTNO_DMAAI);
|
---|
117 | }
|
---|
118 |
|
---|
119 | void dmac_set_irq(dmac_channel_number_t channel_num,
|
---|
120 | plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
|
---|
121 | {
|
---|
122 | if (channel_num != AI_DMA_CH)
|
---|
123 | return;
|
---|
124 |
|
---|
125 | //set_ipriority(INTNO_DMAAI, priority);
|
---|
126 |
|
---|
127 | dis_int(INTNO_DMAAI);
|
---|
128 |
|
---|
129 | ai_dma_done_callback = dmac_callback;
|
---|
130 | ai_dma_done_ctx = ctx;
|
---|
131 |
|
---|
132 | ena_int(INTNO_DMAAI);
|
---|
133 | }
|
---|
134 |
|
---|
/* DMA handle for transfers on the AI DMA channel (configured in
 * dmac_set_single_mode). */
DMA_Handle_t g_ai_hdma;
|
---|
136 |
|
---|
/*
 *  Configure and start a single DMA transfer on the AI DMA channel.
 *
 *  Direction and handshake modes are derived from whether src/dest
 *  look like memory (see is_memory(): software handshake) or a
 *  peripheral (hardware handshake).  `block_size` is in units of
 *  `dmac_trans_width`-wide transfers.  Channels other than AI_DMA_CH
 *  are ignored.
 */
void dmac_set_single_mode(dmac_channel_number_t channel_num,
        const void *src, void *dest, uint8_t src_inc,
        uint8_t dest_inc,
        uint8_t dmac_burst_size,
        uint8_t dmac_trans_width,
        size_t block_size)
{
    if (channel_num != AI_DMA_CH)
        return;

    DMA_Handle_t *hdma = &g_ai_hdma;
    /* Classify both endpoints to pick the flow-control mode. */
    int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
    uint8_t flow_control;
    if (mem_type_src == 0 && mem_type_dest == 0)
        flow_control = DMA_PERIPH_TO_PERIPH;
    else if (mem_type_src == 1 && mem_type_dest == 0)
        flow_control = DMA_MEMORY_TO_PERIPH;
    else if (mem_type_src == 0 && mem_type_dest == 1)
        flow_control = DMA_PERIPH_TO_MEMORY;
    else
        flow_control = DMA_MEMORY_TO_MEMORY;

    hdma->Init.Direction = flow_control;                                             /* DMA transfer direction */
    hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE);  /* source handshake */
    hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
    hdma->Init.SrcInc = src_inc;                                                     /* source increment mode */
    hdma->Init.DstInc = dest_inc;                                                    /* destination increment mode */
    hdma->Init.SrcTransWidth = dmac_trans_width;                                     /* source transfer width */
    hdma->Init.DstTransWidth = dmac_trans_width;                                     /* destination transfer width */
    hdma->Init.SrcBurstSize = dmac_burst_size;                                       /* source burst size */
    hdma->Init.DstBurstSize = dmac_burst_size;                                       /* destination burst size */
    dma_reset(hdma);

    dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
}
|
---|
172 |
|
---|
/* Number of layer descriptors pushed per burst when feeding the KPU. */
#define LAYER_BURST_SIZE 12

/* Set nonzero to enable KPU debug instrumentation. */
#define KPU_DEBUG 0
/* Set nonzero to route AI RAM traffic through the cached alias
 * (requires kpu_flush_cache, compiled below under the same flag). */
#define USE_CACHED_AI_RAM 0

/* NOTE(review): function-like min/max evaluate both arguments twice —
 * do not pass expressions with side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
---|
/* Round x up to the next multiple of `align` (power of two).
 * Fix: the original left `x` and `align` unparenthesized, so argument
 * expressions with lower-precedence operators (e.g. `a & b`) were
 * grouped incorrectly inside the macro body. */
#define ALIGN_UP(x, align) (((x) + ((align) - 1)) & (~((align) - 1)))
|
---|
181 |
|
---|
/* Defined later in this file: advance/finish the running kmodel. */
static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);

/* Memory-mapped KPU configuration register block. */
volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
/* Status flag shared with interrupt context (hence volatile). */
static volatile uint32_t kpu_status;
|
---|
187 |
|
---|
/*
 *  Push one layer descriptor into the KPU layer-argument FIFO.
 *
 *  Every store targets the same volatile FIFO register, so the
 *  compiler emits them in program order; this sequence is presumably
 *  the order the hardware pops the twelve words — do not reorder.
 *  ("enabe" is the field's actual (misspelled) name in the SDK struct.)
 */
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}
|
---|
203 |
|
---|
204 | void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
|
---|
205 | {
|
---|
206 | uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
|
---|
207 | dmac_set_irq(dma_ch, callback, userdata, 1);
|
---|
208 | dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
|
---|
209 | DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
|
---|
210 | }
|
---|
211 |
|
---|
/* Submit one convolution layer to the hardware — a thin wrapper over
 * kpu_send_layer(). */
static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}
|
---|
216 |
|
---|
/*
 *  Run a convolution layer on the KPU: acknowledge any stale interrupt
 *  sources, program the interrupt mask, then push the layer descriptor.
 */
void kpu_conv2d(kpu_layer_argument_t *layer)
{
    /* Clear all three interrupt sources before starting. */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* NOTE(review): assuming mask bit 1 = suppressed, this leaves only
     * the layer-cfg-almost-empty interrupt enabled — confirm against
     * the K210 KPU register description. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}
|
---|
229 |
|
---|
/*
 *  Global average pool over quantized (uint8) data, requantized to the
 *  output scale.
 *
 *  src         : kernel_size contiguous samples per channel
 *  src_param   : input quantization (scale/bias)
 *  kernel_size : samples averaged per channel
 *  channels    : number of channels (assumed >= 0 — compared against size_t)
 *  dest        : output buffer; if it points into the 2 MiB KPU I/O RAM
 *                window the result is written in the KPU 64-byte-row
 *                layout for a 1x1 map, otherwise as a flat vector
 *  dest_param  : output quantization
 */
void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;

    if (((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        /* KPU I/O RAM packing constants for a 1x1 (width <= 16) image:
         * four channels share each 64-byte row, 16 bytes apart. */
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;

        for (oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < 1; y++) /* 1x1 output: single row/column */
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for (i = 0; i < kernel_size; i++)
                        sum += *src++;

                    /* Dequantize the mean, requantize to the output
                     * scale, clamp to uint8 range. */
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if (value < 0)
                        value = 0;
                    if (value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    }
    else
    {
        /* Plain memory: one byte per channel. */
        for (oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for (i = 0; i < kernel_size; i++)
                sum += *src++;

            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if (value < 0)
                value = 0;
            if (value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}
|
---|
283 |
|
---|
284 | void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
|
---|
285 | {
|
---|
286 | quantize_param_t q = *src_param;
|
---|
287 | size_t oc;
|
---|
288 |
|
---|
289 | for (oc = 0; oc < channels; oc++)
|
---|
290 | {
|
---|
291 | int64_t sum = 0;
|
---|
292 | size_t i;
|
---|
293 | for (i = 0; i < kernel_size; i++)
|
---|
294 | sum += *src++;
|
---|
295 |
|
---|
296 | float value = (sum * q.scale + q.bias) / kernel_size;
|
---|
297 | dest[oc] = value;
|
---|
298 | }
|
---|
299 | }
|
---|
300 |
|
---|
#if USE_CACHED_AI_RAM
/*
 *  Copy `lines` 64-byte rows from the cached AI RAM alias to the
 *  uncached I/O alias so the KPU observes data written through the
 *  cache.  `addr` is in 64-byte units.  Compiled only when
 *  USE_CACHED_AI_RAM is enabled.
 */
static void kpu_flush_cache(uint32_t addr, size_t lines)
{
    size_t line;
    for (line = 0; line < lines; line++)
    {
        /* One 64-byte line = eight 64-bit words. */
        const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
        uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
        size_t i;
        for (i = 0; i < 8; i++)
            dest[i] = src[i];
    }
}
#endif
|
---|
/*
 *  Arithmetic right shift by `shift` bits with round-half-away-from-
 *  zero: the last bit shifted out acts as a carry that bumps the
 *  magnitude.  shift == 0 returns the value unchanged.
 */
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if (shift == 0)
        return value;

    /* Shift all but the last bit so the carry sits in bit 0. */
    int64_t v = value >> (shift - 1);
    if ((v & 1) == 0)
        return v >> 1;
    /* Carry set: round away from zero. */
    return (v < 0) ? (v >> 1) - 1 : (v >> 1) + 1;
}
|
---|
/*
 *  Copy a width x height x channels uint8 image from CPU memory into
 *  KPU I/O RAM at `kpu_addr` (in 64-byte units), converting to the
 *  KPU row layout: each image row occupies row_length 64-byte rows,
 *  and narrow images pack `row_group` channels per 64-byte row at
 *  `row_padding`-byte offsets.
 *
 *  A fast path copies 64-bit words when both the source pointer and
 *  width are 8-byte aligned, with hand-unrolled variants for widths
 *  8/16/32 bytes; otherwise bytes are copied individually.
 */
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    /* Select the packing parameters from the image width. */
    if (width <= 16)
    {
        row_padding = 16;
        row_group = 4; /* four channels per 64-byte row */
        row_length = 1;
    }
    else if (width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    }
    else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64; /* 64-byte rows per image row */
    }

    if ((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
/* UPLOAD_BEGIN/END bracket a channel+row loop; the body between them
 * copies one image row starting at y_origin. */
#define UPLOAD_BEGIN() \
    for (oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for (y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);

#define UPLOAD_END() \
    } \
    }

        width /= 8; /* width now counted in 64-bit words */
        const uint64_t *u64_src = (const uint64_t *)src;
        if (width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        }
        else if (width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        }
        else if (width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        }
        else
        {
            UPLOAD_BEGIN()
            for (x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    }
    else
    {
        /* Unaligned fallback: byte-at-a-time copy, same layout. */
        for (oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for (y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for (x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}
|
---|
425 | static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
|
---|
426 | {
|
---|
427 | size_t width = layer->image_size.data.i_row_wid + 1;
|
---|
428 | size_t height = layer->image_size.data.i_col_high + 1;
|
---|
429 | size_t channels = layer->image_channel_num.data.i_ch_num + 1;
|
---|
430 |
|
---|
431 | kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
|
---|
432 | }
|
---|
433 |
|
---|
/* Copy a float input tensor of `count` elements into the working buffer. */
static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, count * sizeof(*dest));
}
|
---|
438 |
|
---|
439 | static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
|
---|
440 | {
|
---|
441 | size_t i;
|
---|
442 |
|
---|
443 | if (act == KLA_RELU)
|
---|
444 | {
|
---|
445 | for (i = 0; i < count; i++)
|
---|
446 | data[i] = max(data[i], 0);
|
---|
447 | }
|
---|
448 | else if (act == KLA_RELU6)
|
---|
449 | {
|
---|
450 | for (i = 0; i < count; i++)
|
---|
451 | data[i] = min(max(data[i], 0), 6);
|
---|
452 | }
|
---|
453 | }
|
---|
454 |
|
---|
455 | static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
456 | {
|
---|
457 | const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
|
---|
458 | const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
|
---|
459 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
460 | size_t i, count = arg->count;
|
---|
461 |
|
---|
462 | for (i = 0; i < count; i++)
|
---|
463 | dest[i] = src_a[i] + src_b[i];
|
---|
464 | }
|
---|
465 |
|
---|
/*
 *  Elementwise addition of two quantized (uint8) tensors with
 *  requantized output.  Per element:
 *      a' = (a + off_a) * mul_a;  b' = (b + off_b) * mul_b
 *      v  = (a' + b') >> sh        (sh applied once when sh_a == sh_b,
 *                                   otherwise per input before adding)
 *      out = clamp(carry_shift(v * mul_o, sh_o) + off_o, 0, 255)
 *
 *  Elements are processed 8 per iteration via the QADD_UNROLL_* macros
 *  (count = ALIGN_UP(arg->count, 8) / 8 iterations); each QADD_UNROLL_S
 *  stage expands across all 8 lanes before the next stage runs.  The
 *  macros are #undef'd and redefined for the sh_a != sh_b variant —
 *  keep each definition set adjacent to its loop.
 */
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;

    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;

    if (sh_a == sh_b)
    {
/* Variant 1: common input shift, applied once after the add. */
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;

#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;

#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#define QADD_UNROLL_8(x) \
    v##x += off_o;

#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

/* Expand one pipeline stage across the 8 unrolled lanes. */
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
    else
    {
/* Variant 2: differing input shifts, applied per input before adding. */
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;

#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;

#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;

#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;

#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;

#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;

#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);

#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;

#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));

#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;

#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)

        for (i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}
|
---|
609 |
|
---|
610 | static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
611 | {
|
---|
612 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
613 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
614 | size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
|
---|
615 |
|
---|
616 | for (oc = 0; oc < channels; oc++)
|
---|
617 | {
|
---|
618 | float sum = 0.f;
|
---|
619 | size_t i;
|
---|
620 | for (i = 0; i < kernel_size; i++)
|
---|
621 | sum += *src++;
|
---|
622 |
|
---|
623 | dest[oc] = sum / kernel_size;
|
---|
624 | }
|
---|
625 | }
|
---|
626 |
|
---|
/*
 *  2-D max pooling over quantized (uint8) data, channel-planar layout
 *  (one width x height plane per channel).  Padding is handled by
 *  clipping the kernel window to the input bounds; the window maximum
 *  starts at 0, which is valid because inputs are unsigned.
 */
static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Top-left of the window in input coordinates; may be
                 * negative because of padding. */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel window to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                uint8_t value = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }

                *dest++ = value;
            }
        }
    }
}
|
---|
669 |
|
---|
/*
 *  2-D average pooling over float data, channel-planar layout.
 *  Padding is handled by clipping the kernel window to the input
 *  bounds; the divisor is the number of in-bounds samples actually
 *  accumulated (count-excluding-padding semantics).
 */
static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;

    uint32_t out_y, out_x, oc;

    for (oc = 0; oc < out_shape.channels; oc++)
    {
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for (out_y = 0; out_y < out_shape.height; out_y++)
        {
            for (out_x = 0; out_x < out_shape.width; out_x++)
            {
                /* Window origin in input coordinates (may be negative
                 * because of padding). */
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                /* Clip the kernel window to the valid input region. */
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                float kernel_count = 0;

                int32_t kernel_y, kernel_x;
                for (kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for (kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }

                *dest++ = value / kernel_count;
            }
        }
    }
}
|
---|
714 |
|
---|
715 | static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
716 | {
|
---|
717 | size_t count = arg->count;
|
---|
718 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
719 |
|
---|
720 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
721 |
|
---|
722 | float scale = 1.f / q.scale;
|
---|
723 |
|
---|
724 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
|
---|
725 | size_t i;
|
---|
726 | for (i = 0; i < count; i++)
|
---|
727 | {
|
---|
728 | int value = roundf((*src++ - q.bias) * scale);
|
---|
729 | if (value < 0)
|
---|
730 | value = 0;
|
---|
731 | if (value > 0xFF)
|
---|
732 | value = 0xFF;
|
---|
733 | *dest++ = (uint8_t)value;
|
---|
734 | }
|
---|
735 | }
|
---|
736 |
|
---|
737 | static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
738 | {
|
---|
739 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
740 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
741 | size_t oc, count = arg->count;
|
---|
742 | kpu_model_quant_param_t q = arg->quant_param;
|
---|
743 |
|
---|
744 | for (oc = 0; oc < count; oc++)
|
---|
745 | dest[oc] = *src++ * q.scale + q.bias;
|
---|
746 | }
|
---|
747 |
|
---|
748 | static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
|
---|
749 | {
|
---|
750 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
751 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
752 | size_t oc, i, channels = arg->channels, count = arg->channel_size;
|
---|
753 |
|
---|
754 | for (oc = 0; oc < channels; oc++)
|
---|
755 | {
|
---|
756 | const kpu_model_quant_param_t q = arg->quant_params[oc];
|
---|
757 |
|
---|
758 | for (i = 0; i < count; i++)
|
---|
759 | *dest++ = *src++ * q.scale + q.bias;
|
---|
760 | }
|
---|
761 | }
|
---|
762 |
|
---|
763 | static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
764 | {
|
---|
765 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
766 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
767 | size_t oc, count = arg->count;
|
---|
768 | const uint8_t *table = arg->table;
|
---|
769 |
|
---|
770 | if (false && count % 8 == 0)
|
---|
771 | {
|
---|
772 | for (oc = 0; oc < count;)
|
---|
773 | {
|
---|
774 | dest[oc++] = table[*src++];
|
---|
775 | dest[oc++] = table[*src++];
|
---|
776 | dest[oc++] = table[*src++];
|
---|
777 | dest[oc++] = table[*src++];
|
---|
778 | dest[oc++] = table[*src++];
|
---|
779 | dest[oc++] = table[*src++];
|
---|
780 | dest[oc++] = table[*src++];
|
---|
781 | dest[oc++] = table[*src++];
|
---|
782 | }
|
---|
783 | }
|
---|
784 | else
|
---|
785 | {
|
---|
786 | for (oc = 0; oc < count; oc++)
|
---|
787 | dest[oc] = table[src[oc]];
|
---|
788 | }
|
---|
789 | }
|
---|
790 |
|
---|
791 | static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
792 | {
|
---|
793 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
794 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
795 | size_t oc, channels = arg->channels;
|
---|
796 |
|
---|
797 | float sum = 0.f;
|
---|
798 | const float epsilon = 1e-10f;
|
---|
799 | for (oc = 0; oc < channels; oc++)
|
---|
800 | sum += src[oc] * src[oc];
|
---|
801 | if (sum < epsilon)
|
---|
802 | sum = epsilon;
|
---|
803 | sum = 1.f / sqrtf(sum);
|
---|
804 | for (oc = 0; oc < channels; oc++)
|
---|
805 | dest[oc] = src[oc] * sum;
|
---|
806 | }
|
---|
807 |
|
---|
808 | static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
809 | {
|
---|
810 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
811 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
812 | size_t oc, channels = arg->channels;
|
---|
813 |
|
---|
814 | float max = FLT_MIN;
|
---|
815 | for (oc = 0; oc < channels; oc++)
|
---|
816 | max = fmaxf(max, src[oc]);
|
---|
817 |
|
---|
818 | float sum = 0.f;
|
---|
819 | for (oc = 0; oc < channels; oc++)
|
---|
820 | {
|
---|
821 | float value = expf(src[oc] - max);
|
---|
822 | sum += value;
|
---|
823 | dest[oc] = value;
|
---|
824 | }
|
---|
825 |
|
---|
826 | for (oc = 0; oc < channels; oc++)
|
---|
827 | dest[oc] /= sum;
|
---|
828 | }
|
---|
829 |
|
---|
830 | static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
831 | {
|
---|
832 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
833 | uint32_t count = arg->input_count, i;
|
---|
834 |
|
---|
835 | for (i = 0; i < count; i++)
|
---|
836 | {
|
---|
837 | kpu_model_memory_range_t input = arg->inputs_mem[i];
|
---|
838 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
|
---|
839 | memcpy(dest, src, input.size);
|
---|
840 | dest += input.size;
|
---|
841 | }
|
---|
842 | }
|
---|
843 |
|
---|
844 | static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
845 | {
|
---|
846 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
847 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
848 | uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
|
---|
849 | float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
|
---|
850 | float *bias = (float *)malloc(out_channels * sizeof(float));
|
---|
851 | memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
|
---|
852 | memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
|
---|
853 |
|
---|
854 | if (in_channels % 8 == 0)
|
---|
855 | {
|
---|
856 | #define FC_UNROLL_1(x) \
|
---|
857 | float i##x = *c_src++; \
|
---|
858 | float w##x = *c_weights++;
|
---|
859 |
|
---|
860 | #define FC_UNROLL_2(x) \
|
---|
861 | sum += i##x * w##x;
|
---|
862 |
|
---|
863 | #define FC_UNROLL_S(x) \
|
---|
864 | FC_UNROLL_##x(0) \
|
---|
865 | FC_UNROLL_##x(1) \
|
---|
866 | FC_UNROLL_##x(2) \
|
---|
867 | FC_UNROLL_##x(3) \
|
---|
868 | FC_UNROLL_##x(4) \
|
---|
869 | FC_UNROLL_##x(5) \
|
---|
870 | FC_UNROLL_##x(6) \
|
---|
871 | FC_UNROLL_##x(7)
|
---|
872 |
|
---|
873 | for (oc = 0; oc < out_channels; oc++)
|
---|
874 | {
|
---|
875 | const float *c_src = src;
|
---|
876 | const float *c_weights = weights + oc * in_channels;
|
---|
877 |
|
---|
878 | float sum = 0.0f;
|
---|
879 | for (ic = 0; ic < in_channels / 8; ic++)
|
---|
880 | {
|
---|
881 | FC_UNROLL_S(1);
|
---|
882 | FC_UNROLL_S(2);
|
---|
883 | }
|
---|
884 |
|
---|
885 | dest[oc] = sum + bias[oc];
|
---|
886 | }
|
---|
887 | }
|
---|
888 | else
|
---|
889 | {
|
---|
890 | for (oc = 0; oc < out_channels; oc++)
|
---|
891 | {
|
---|
892 | const float *c_weights = weights + oc * in_channels;
|
---|
893 |
|
---|
894 | float sum = 0.0f;
|
---|
895 | for (ic = 0; ic < in_channels; ic++)
|
---|
896 | sum += src[ic] * c_weights[ic];
|
---|
897 | dest[oc] = sum + bias[oc];
|
---|
898 | }
|
---|
899 | }
|
---|
900 | free(weights);
|
---|
901 | free(bias);
|
---|
902 | kpu_float_activation(dest, out_channels, arg->act);
|
---|
903 | }
|
---|
904 |
|
---|
905 | static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
906 | {
|
---|
907 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
908 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
909 | kpu_model_shape_t in_shape = arg->shape;
|
---|
910 | uint32_t oc, oy, ox;
|
---|
911 |
|
---|
912 | for (oy = 0; oy < in_shape.height; oy++)
|
---|
913 | for (ox = 0; ox < in_shape.width; ox++)
|
---|
914 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
915 | *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
|
---|
916 | }
|
---|
917 |
|
---|
918 | static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
919 | {
|
---|
920 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
921 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
922 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
923 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
924 | uint32_t oc, oy, ox;
|
---|
925 |
|
---|
926 | float height_scale = (float)in_shape.height / out_height;
|
---|
927 | float width_scale = (float)in_shape.width / out_width;
|
---|
928 |
|
---|
929 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
930 | {
|
---|
931 | const float *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
932 | for (oy = 0; oy < out_height; oy++)
|
---|
933 | {
|
---|
934 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
935 | const float *y_origin = channel_src + in_y * in_shape.width;
|
---|
936 | for (ox = 0; ox < out_width; ox++)
|
---|
937 | {
|
---|
938 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
939 | *dest++ = y_origin[in_x];
|
---|
940 | }
|
---|
941 | }
|
---|
942 | }
|
---|
943 | }
|
---|
944 |
|
---|
945 | static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
946 | {
|
---|
947 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
948 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
949 | kpu_model_shape_t in_shape = arg->in_shape;
|
---|
950 | uint32_t out_width = arg->out_width, out_height = arg->out_height;
|
---|
951 | uint32_t oc, oy, ox;
|
---|
952 |
|
---|
953 | float height_scale = (float)in_shape.height / out_height;
|
---|
954 | float width_scale = (float)in_shape.width / out_width;
|
---|
955 |
|
---|
956 | for (oc = 0; oc < in_shape.channels; oc++)
|
---|
957 | {
|
---|
958 | const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
|
---|
959 | for (oy = 0; oy < out_height; oy++)
|
---|
960 | {
|
---|
961 | uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
|
---|
962 | const uint8_t *y_origin = channel_src + in_y * in_shape.width;
|
---|
963 | for (ox = 0; ox < out_width; ox++)
|
---|
964 | {
|
---|
965 | uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
|
---|
966 | *dest++ = y_origin[in_x];
|
---|
967 | }
|
---|
968 | }
|
---|
969 | }
|
---|
970 | }
|
---|
971 |
|
---|
972 | static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
973 | {
|
---|
974 | const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
975 | float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
976 | size_t oc, channels = arg->channels;
|
---|
977 |
|
---|
978 | for (oc = 0; oc < channels; oc++)
|
---|
979 | dest[oc] = 1.f / (1.f + expf(-src[oc]));
|
---|
980 | }
|
---|
981 |
|
---|
/*
 * Queue one K210 hardware convolution layer.
 *
 * The layer descriptor is copied out of the model buffer, then its weight /
 * batch-norm / activation table pointers are rebased by subtracting IOMEM
 * before being handed to the KPU.
 *
 * Two completion paths:
 *  - KLF_MAIN_MEM_OUT set: results are DMA'd from the KPU output FIFO into
 *    main memory; the DMA completion callback advances the pipeline
 *    (ai_step) or, for the last layer, finishes the run (kpu_kmodel_done).
 *    All three KPU interrupt sources are masked.
 *  - otherwise: output stays in KPU memory; the "calc done" interrupt is
 *    left unmasked and the layer's int_en flag is set so hardware completion
 *    re-enters the pipeline via the KPU interrupt.
 */
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    /* Work on a private copy of the descriptor; the in-model original is not
     * modified. */
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;

    if (arg->flags & KLF_MAIN_MEM_OUT)
    {
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        /* Clear any pending KPU interrupt flags... */
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        /* ...and mask all of them: completion is signalled via DMA instead. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
        /* current_layer was already advanced by ai_step: if more layers
         * remain, continue the pipeline from the DMA IRQ; else finish. */
        if (ctx->current_layer < ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        /* 64-bit FIFO reads; length rounded up to whole 8-byte beats. */
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    }
    else
    {
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};

        /* Leave calc_done unmasked so the KPU interrupt drives ai_step. */
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.interrupt_enabe.data.int_en = 1;
    }

    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}
|
---|
1026 |
|
---|
/*
 * Copy a flat channel vector from main memory into the KPU's padded RAM
 * layout: 64-byte rows, row_group channels packed per row at
 * row_padding-byte offsets.
 *
 * NOTE(review): row_padding/row_group/row_length/height are hard-coded and
 * the x/y loops each run exactly once, so this path only handles 1x1
 * feature maps (one byte per channel) — confirm against the model
 * compiler's use of KL_K210_ADD_PADDING.
 */
static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
#if USE_CACHED_AI_RAM
    /* Cached alias of KPU RAM; requires the explicit flush below. */
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
#else
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
#endif

    /* Fixed KPU RAM geometry for the 1x1 case. */
    uint32_t row_padding = 16;
    uint32_t row_group = 4;
    uint32_t row_length = 1;
    uint32_t height = 4;
    uint32_t oc, x, y, channels = arg->channels;

    for (oc = 0; oc < channels; oc++)
    {
        /* Each group of 4 channels shares a 64-byte row; within the row a
         * channel's data starts every 16 bytes. */
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        for (y = 0; y < 1; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for (x = 0; x < 1; x++)
                y_origin[x] = *src++;
        }
    }

#if USE_CACHED_AI_RAM
    /* Writes went through the cached alias; flush so the KPU observes them. */
    uint32_t lines = row_length * height * channels / row_group;
    kpu_flush_cache(arg->kpu_mem_out_address, lines);
#endif
}
|
---|
1058 |
|
---|
1059 | static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1060 | {
|
---|
1061 | const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
|
---|
1062 | uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
|
---|
1063 | uint32_t oc, channels = arg->channels;
|
---|
1064 |
|
---|
1065 | for (oc = 0; oc < channels; oc++)
|
---|
1066 | *dest++ = src[oc * 16];
|
---|
1067 | }
|
---|
1068 |
|
---|
1069 | static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
|
---|
1070 | {
|
---|
1071 | size_t width = arg->width;
|
---|
1072 | size_t height = arg->height;
|
---|
1073 | size_t channels = arg->channels;
|
---|
1074 |
|
---|
1075 | kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
|
---|
1076 | }
|
---|
1077 |
|
---|
1078 | int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
|
---|
1079 | {
|
---|
1080 | #if FIX_CACHE
|
---|
1081 | configASSERT(is_memory_cache((uintptr_t)buffer));
|
---|
1082 | #endif
|
---|
1083 | uintptr_t base_addr = (uintptr_t)buffer;
|
---|
1084 | const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
|
---|
1085 |
|
---|
1086 | if (header->version == 3 && header->arch == 0)
|
---|
1087 | {
|
---|
1088 | ctx->model_buffer = buffer;
|
---|
1089 | ctx->output_count = header->output_count;
|
---|
1090 | ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
|
---|
1091 | ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
|
---|
1092 | ctx->layers_length = header->layers_length;
|
---|
1093 | ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
|
---|
1094 | ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
|
---|
1095 | if (!ctx->main_buffer)
|
---|
1096 | return -1;
|
---|
1097 | uint32_t body_size = 0;
|
---|
1098 | for (int i = 0; i < ctx->layers_length; i++)
|
---|
1099 | {
|
---|
1100 | const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
|
---|
1101 | body_size += cnt_layer_header->body_size;
|
---|
1102 | }
|
---|
1103 | uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
|
---|
1104 | const uint8_t *body_start_cache = ctx->body_start;
|
---|
1105 | memcpy(body_start_iomem, body_start_cache, body_size);
|
---|
1106 | for (int i = 0; i < body_size; i++)
|
---|
1107 | {
|
---|
1108 | configASSERT(body_start_iomem[i] == body_start_cache[i]);
|
---|
1109 | }
|
---|
1110 | }
|
---|
1111 | else
|
---|
1112 | {
|
---|
1113 | return -1;
|
---|
1114 | }
|
---|
1115 |
|
---|
1116 | return 0;
|
---|
1117 | }
|
---|
1118 |
|
---|
1119 | int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
|
---|
1120 | {
|
---|
1121 | if (index >= ctx->output_count)
|
---|
1122 | return -1;
|
---|
1123 |
|
---|
1124 | const kpu_model_output_t *output = ctx->outputs + index;
|
---|
1125 | *data = ctx->main_buffer + output->address;
|
---|
1126 | *size = output->size;
|
---|
1127 | return 0;
|
---|
1128 | }
|
---|
1129 |
|
---|
1130 | void kpu_model_free(kpu_model_context_t *ctx)
|
---|
1131 | {
|
---|
1132 | free(ctx->main_buffer);
|
---|
1133 | ctx->main_buffer = NULL;
|
---|
1134 | }
|
---|
1135 |
|
---|
#if KPU_DEBUG
/* Per-run profiling state, reset by kpu_run_kmodel and updated at layer
 * boundaries by ai_step / kpu_kmodel_done. */
static uint64_t last_time;       /* timestamp (us) of the previous layer boundary */
static uint64_t total_time;      /* accumulated wall time across all layers */
static uint64_t kpu_time;        /* accumulated time of KL_K210_CONV layers */
static uint32_t last_layer_type; /* type of the layer last_time refers to */

/* Map a KL_* layer type code to a short human-readable name for logging. */
static const char *str_layer_type(uint32_t type)
{
    switch (type)
    {
    case KL_ADD: return "Add";
    case KL_QUANTIZED_ADD: return "QuantAdd";
    case KL_GLOBAL_AVERAGE_POOL2D: return "GAP";
    case KL_QUANTIZED_MAX_POOL2D: return "QuantMaxPool2d";
    case KL_AVERAGE_POOL2D: return "AveragePool2d";
    case KL_QUANTIZE: return "Quantize";
    case KL_DEQUANTIZE: return "Dequantize";
    case KL_REQUANTIZE: return "Requantize";
    case KL_L2_NORMALIZATION: return "L2Norm";
    case KL_SOFTMAX: return "Softmax";
    case KL_CONCAT: return "Concat";
    case KL_QUANTIZED_CONCAT: return "QuantConcat";
    case KL_FULLY_CONNECTED: return "FullyConnected";
    case KL_TENSORFLOW_FLATTEN: return "TFFlatten";
    case KL_RESIZE_NEAREST_NEIGHBOR: return "ResizeNearestNeighbor";
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR: return "QuantResizeNearestNeighbor";
    case KL_CHANNELWISE_DEQUANTIZE: return "ChannelwiseDequantize";
    case KL_LOGISTIC: return "Logistic";
    case KL_K210_CONV: return "K210Conv";
    case KL_K210_ADD_PADDING: return "K210AddPad";
    case KL_K210_REMOVE_PADDING: return "K210RemovePad";
    case KL_K210_UPLOAD: return "K210Upload";
    default: return "Unknown";
    }
}
#endif
|
---|
1195 |
|
---|
/*
 * Final step of a kmodel run: clear any pending KPU interrupt flags, mask
 * all three interrupt sources, emit the optional profiling summary, and
 * invoke the user's completion callback. Always returns 0 (it is also used
 * as a DMA completion callback via a cast).
 */
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    /* Acknowledge anything pending... */
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    /* ...then mask every KPU interrupt source. */
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    uint32_t cnt_layer_id = ctx->current_layer;
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        /* Charge the time since the previous boundary to the layer type
         * recorded there. */
        uint64_t layer_time = time - last_time;
        /* NOTE(review): %d is paired with uint64_t operands here and below;
         * this relies on the TOPPERS syslog varargs convention — confirm,
         * or cast the arguments explicitly. */
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    syslog(LOG_NOTICE, "KPU: %d.%03d ms", kpu_time / 1000, kpu_time % 1000);
    syslog(LOG_NOTICE, "CPU: %d.%03d ms", (total_time - kpu_time) / 1000, (total_time - kpu_time) % 1000);
    syslog(LOG_NOTICE, "Model: %d.%03d ms", total_time / 1000, total_time % 1000);
#endif
    ctx->done_callback(ctx->userdata);
    return 0;
}
|
---|
1225 |
|
---|
/*
 * Execute one layer of the loaded kmodel and advance the pipeline.
 *
 * This function is registered as the KPU PLIC handler and as a DMA
 * completion callback, and also recurses for layers that complete
 * synchronously on the CPU. KL_K210_CONV layers return immediately after
 * being queued to the hardware; their completion interrupt re-enters here.
 * Returns 0 on normal dispatch, -1 on overrun or unsupported layer.
 */
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer;
    const uint8_t *layer_body = ctx->current_body;
    /* Pointer is computed before the bounds check below, but only
     * dereferenced after the check passes. */
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    if (cnt_layer_id >= ctx->layers_length)
    {
        /* Spurious extra invocation (e.g. a late interrupt): finish up. */
        //syslog(LOG_NOTICE, "overrun");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* Advance the cursor to the next layer before dispatching this one. */
    ctx->current_layer++;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    uint64_t time = sysctl_get_time_us();
    if (last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
        total_time += layer_time;
        if (last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    /* Dispatch on layer type; each helper parses its own argument struct
     * from layer_body. */
    switch (cnt_layer_header->type)
    {
    case KL_ADD:
        kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_ADD:
        kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
        break;
    case KL_GLOBAL_AVERAGE_POOL2D:
        kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_MAX_POOL2D:
        kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_AVERAGE_POOL2D:
        kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZE:
        kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_DEQUANTIZE:
        kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_REQUANTIZE:
        kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
        break;
    case KL_L2_NORMALIZATION:
        kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
        break;
    case KL_SOFTMAX:
        kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CONCAT:
    case KL_QUANTIZED_CONCAT:
        /* Both concat variants are byte copies; one helper serves both. */
        kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
        break;
    case KL_FULLY_CONNECTED:
        kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
        break;
    case KL_TENSORFLOW_FLATTEN:
        kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
        break;
    case KL_RESIZE_NEAREST_NEIGHBOR:
        kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
        kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
        break;
    case KL_CHANNELWISE_DEQUANTIZE:
        kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
        break;
    case KL_LOGISTIC:
        kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_CONV:
        kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
        /* Hardware layer: continuation is driven by the KPU/DMA interrupt,
         * not by the recursion below. */
        return 0;
    case KL_K210_ADD_PADDING:
        kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_REMOVE_PADDING:
        kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
        break;
    case KL_K210_UPLOAD:
        kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
        break;
    default:
        assert(!"Layer is not supported.");
        kpu_kmodel_done(ctx);
        return -1;
    }

    /* NOTE(review): when current_layer == layers_length - 1 there is still
     * one unprocessed layer, yet kpu_kmodel_done() is taken. The upstream
     * Kendryte SDK compares against layers_length here — confirm whether
     * skipping the final layer is intentional in this port. */
    if (ctx->current_layer < (ctx->layers_length - 1))
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
|
---|
1336 |
|
---|
/*
 * Run ai_step() from task (non-ISR) context with the AI DMA and KPU
 * interrupts disabled, so the synchronous pipeline walk cannot race the
 * interrupt-driven one. Disable/enable ordering is deliberate; do not
 * reorder.
 */
static void ai_step_not_isr(void *userdata)
{
    dis_int(INTNO_DMAAI);
    dis_int(INTNO_AI);

    ai_step(userdata);

    ena_int(INTNO_DMAAI);
    ena_int(INTNO_AI);
}
|
---|
1347 |
|
---|
/*
 * Start asynchronous execution of a previously loaded kmodel.
 *
 * src points to the input tensor, dma_ch is the DMA channel used for
 * KPU<->memory transfers, and done_callback(userdata) fires when the run
 * completes. Returns 0 on success, -1 if the first layer is not one of the
 * types that can accept external input (K210 conv or fully-connected).
 */
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    /* Reset the per-run profiling counters. */
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    /* Clear all pending interrupt flags (bits 0..2), set FIFO thresholds,
     * select 8-bit mode from kmodel header flag bit 0, and unmask only the
     * "layer config almost empty" interrupt. */
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    /* Route the KPU interrupt to the pipeline stepper. */
    //plic_set_priority(INTNO_AI, 1);
    plic_irq_register(INTNO_AI, ai_step, ctx);
    plic_irq_enable(INTNO_AI);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;

    /* Feed the input according to what the first layer expects. */
    switch (first_layer_header->type)
    {
    case KL_K210_CONV:
    {
        const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
        kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

        /* If (row width + 1) is not a multiple of 64, the input needs
         * CPU-side padding; otherwise it can be DMA'd straight in. */
        if ((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
        {
            kpu_kmodel_input_with_padding(&layer_arg, src);
            ai_step_not_isr(ctx);
        }
        else
        {
            kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
        }
    }
    break;
    case KL_FULLY_CONNECTED:
    {
        const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
        kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
        ai_step_not_isr(ctx);
    }
    break;
    default:
        return -1;
    }

    return 0;
}
|
---|
1409 |
|
---|
/*
 * Configure the AI DMA handle (g_ai_hdma) used for KPU output transfers,
 * attach the model context as its local data, and initialize the channel.
 * Returns the TOPPERS error code from dma_init().
 */
ER kpu_init(kpu_model_context_t *ctx)
{
    g_ai_hdma.chnum = AI_DMA_CH;
    g_ai_hdma.xfercallback = ai_dma_done_isr;
    g_ai_hdma.errorcallback = NULL;
    g_ai_hdma.Init.Request = DMA_SELECT_AI_RX_REQ;      /* DMA request select */
    g_ai_hdma.Init.Direction = DMA_PERIPH_TO_MEMORY;    /* DMA transfer direction */
    g_ai_hdma.Init.SrcMultBlock = DMAC_MULTBLOCK_CONT;  /* source multi-block type */
    g_ai_hdma.Init.DrcMultBlock = DMAC_MULTBLOCK_CONT;  /* destination multi-block type */
    g_ai_hdma.Init.SrcHandShake = DMAC_HS_HARDWARE;     /* source handshake */
    g_ai_hdma.Init.DrcHandShake = DMAC_HS_SOFTWARE;     /* destination handshake */
    g_ai_hdma.Init.SrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* source hardware handshake polarity */
    g_ai_hdma.Init.DrcHwhsPol = DMAC_HWHS_POLARITY_LOW; /* destination hardware handshake polarity */
    g_ai_hdma.Init.Priority = 4;                        /* channel priority */
    g_ai_hdma.Init.SrcMaster = DMAC_MASTER1;            /* source master */
    g_ai_hdma.Init.DstMaster = DMAC_MASTER2;            /* destination master */
    g_ai_hdma.Init.SrcInc = DMAC_ADDR_NOCHANGE;         /* source address increment */
    g_ai_hdma.Init.DstInc = DMAC_ADDR_INCREMENT;        /* destination address increment */
    g_ai_hdma.Init.SrcTransWidth = DMAC_TRANS_WIDTH_32; /* source transfer width */
    g_ai_hdma.Init.DstTransWidth = DMAC_TRANS_WIDTH_32; /* destination transfer width */
    g_ai_hdma.Init.SrcBurstSize = DMAC_MSIZE_4;         /* source burst size */
    g_ai_hdma.Init.DstBurstSize = DMAC_MSIZE_4;         /* destination burst size */
    g_ai_hdma.Init.IocBlkTrans = 0;                     /* IOC block transfer */
    g_ai_hdma.localdata = (void *)ctx;

    return dma_init(&g_ai_hdma);
}
|
---|