source: azure_iot_hub_riscv/trunk/app_iothub_client/kendryte/kpu.c@453

Last change on this file since 453 was 453, checked in by coas-nagasima, 4 years ago

Add files

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-csrc;charset=UTF-8
File size: 64.2 KB
1#include <assert.h>
2#include <float.h>
3#include <math.h>
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7#include <stdint.h>
8#include <kernel.h>
9#include <t_syslog.h>
10#include <t_stdlib.h>
11#include <kernel_impl.h>
12#include <target_syssvc.h>
13#include "kendryte-k210.h"
14#include "device.h"
15#include "atomic.h"
16#include "kpu.h"
17#include "utils.h"
18#include "kpu_main.h"
19
20#define sil_orw_mem(a, b) sil_wrw_mem((a), sil_rew_mem(a) | (b))
21
22void sysctl_enable_irq(void)
23{
24 set_csr(mie, MIP_MEIP);
25 set_csr(mstatus, MSTATUS_MIE);
26}
27
28void sysctl_disable_irq(void)
29{
30 clear_csr(mie, MIP_MEIP);
31 clear_csr(mstatus, MSTATUS_MIE);
32}
33
34uint64_t sysctl_get_time_us(void)
35{
36 uint64_t v_cycle = read_cycle();
37 return v_cycle * 1000000 / SYSCTRL_CLOCK_FREQ_IN0;
38}
39
40static int is_memory(uintptr_t address)
41{
42 enum
43 {
44 mem_len = 6 * 1024 * 1024,
45 mem_no_cache_len = 8 * 1024 * 1024,
46 };
47 return ((address >= 0x80000000) && (address < 0x80000000 + mem_len)) || ((address >= 0x40000000) && (address < 0x40000000 + mem_no_cache_len)) || (address == 0x50450040);
48}
49
50uint32_t is_memory_cache(uintptr_t address)
51{
52 #define MEM_CACHE_LEN (6 * 1024 * 1024)
53
54 return ((address >= 0x80000000) && (address < 0x80000000 + MEM_CACHE_LEN));
55}
56
57int plic_irq_enable(INTNO irq_number)
58{
59 if (irq_number != INTNO_AI)
60 return -1;
61 ena_int(irq_number);
62 return 0;
63}
64
65int plic_set_priority(INTNO irq_number, uint32_t priority)
66{
67 if (irq_number != INTNO_AI)
68 return -1;
69 set_ipriority(irq_number, priority);
70 return 0;
71}
72
73plic_irq_callback_t ai_done_callback;
74void *ai_done_ctx;
75
76void plic_irq_register(INTNO irq, plic_irq_callback_t callback, void *ctx)
77{
78 ER ret;
79 if (irq != INTNO_AI)
80 return;
81
82 ret = loc_cpu();
83
84 ai_done_callback = callback;
85 ai_done_ctx = ctx;
86
87 if (ret == E_OK)
88 unl_cpu();
89}
90
91void ai_done_isr(intptr_t exinf)
92{
93 sysctl_disable_irq();
94 if (ai_done_callback != NULL){
95 ai_done_callback(ai_done_ctx);
96 }
97 sysctl_enable_irq();
98}
99
100plic_irq_callback_t ai_dma_done_callback;
101void *ai_dma_done_ctx;
102
103void kpu_dmac_irq_register(dmac_channel_number_t channel_num,
104 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
105{
106 ER ret;
107 if (channel_num != AI_DMA_CH)
108 return;
109
110 set_ipriority(INTNO_DMAAI, priority);
111
112 ret = loc_cpu();
113
114 ai_dma_done_callback = dmac_callback;
115 ai_dma_done_ctx = ctx;
116
117 if (ret == E_OK)
118 unl_cpu();
119}
120
121void ai_dma_done_isr(DMA_Handle_t *dma)
122{
123 sysctl_disable_irq();
124 if (ai_dma_done_callback != NULL) {
125 ai_dma_done_callback(ai_dma_done_ctx);
126 }
127 sysctl_enable_irq();
128}
129
130void dmac_set_irq(dmac_channel_number_t channel_num,
131 plic_irq_callback_t dmac_callback, void *ctx, uint32_t priority)
132{
133 ER ret;
134 if (channel_num != AI_DMA_CH)
135 return;
136
137 set_ipriority(INTNO_DMAAI, priority);
138
139 ret = loc_cpu();
140
141 ai_dma_done_callback = dmac_callback;
142 ai_dma_done_ctx = ctx;
143
144 if (ret == E_OK)
145 unl_cpu();
146}
147
148DMA_Handle_t g_ai_hdma;
149
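/*
 * Configure the AI DMA channel for a single block transfer. The transfer
 * direction is derived from whether src/dest fall inside a known memory
 * region (is_memory()); handshaking is software for memory endpoints and
 * hardware for peripheral endpoints.
 */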
150void dmac_set_single_mode(dmac_channel_number_t channel_num,
151 const void *src, void *dest, uint8_t src_inc,
152 uint8_t dest_inc,
153 uint8_t dmac_burst_size,
154 uint8_t dmac_trans_width,
155 size_t block_size)
156{
157 if (channel_num != AI_DMA_CH)
158 return;
159
160 DMA_Handle_t *hdma = &g_ai_hdma;
161 int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
162 uint8_t flow_control;
163 if(mem_type_src == 0 && mem_type_dest == 0)
164 flow_control = DMA_PERIPH_TO_PERIPH;
165 else if(mem_type_src == 1 && mem_type_dest == 0)
166 flow_control = DMA_MEMORY_TO_PERIPH;
167 else if(mem_type_src == 0 && mem_type_dest == 1)
168 flow_control = DMA_PERIPH_TO_MEMORY;
169 else
170 flow_control = DMA_MEMORY_TO_MEMORY;
171
172 hdma->Init.Direction = flow_control; /* DMA transfer direction */
173 hdma->Init.SrcHandShake = (mem_type_src ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* source handshake */
174 hdma->Init.DrcHandShake = (mem_type_dest ? DMAC_HS_SOFTWARE : DMAC_HS_HARDWARE); /* destination handshake */
175 hdma->Init.SrcInc = src_inc; /* source increment setting */
176 hdma->Init.DstInc = dest_inc; /* destination increment setting */
177 hdma->Init.SrcTransWidth = dmac_trans_width; /* source transfer width */
178 hdma->Init.DstTransWidth = dmac_trans_width; /* destination transfer width */
179 hdma->Init.SrcBurstSize = dmac_burst_size; /* source burst size */
180 hdma->Init.DstBurstSize = dmac_burst_size; /* destination burst size */
181 dma_reset(hdma);
182
183 dma_start(hdma, (uintptr_t)src, (uintptr_t)dest, block_size);
184}
185
186#define LAYER_BURST_SIZE 12
187
188#define KPU_DEBUG 0
189#define USE_CACHED_AI_RAM 0
190
191#define min(a, b) (((a) < (b)) ? (a) : (b))
192#define max(a, b) (((a) > (b)) ? (a) : (b))
193#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))
194
195static int ai_step(void *userdata);
196static int kpu_kmodel_done(kpu_model_context_t *ctx);
197
198volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
199static volatile uint32_t kpu_status;
200
201typedef struct kpu_context
202{
203 kpu_task_t kpu_task;
204 uint32_t kpu_status;
205} kpu_context_t;
206
207volatile kpu_context_t g_kpu_context;
208
209static int kpu_run_all_done(void *_task)
210{
211 atomic_swap(&g_kpu_context.kpu_status, 0);
212 kpu_task_t *task = (kpu_task_t *)_task;
213 task->callback(task);
214 return 0;
215}
216
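/*
 * Push the next batch of layer descriptors into the KPU layer-argument FIFO.
 * Each layer is written as 12 argument words; the burst size here is 1, so
 * after the initial push the AI interrupt (kpu_continue is registered on
 * INTNO_AI in kpu_run) re-invokes this function to queue one more layer
 * until remain_layers_length reaches zero.
 */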
217int kpu_continue(void *_task)
218{
219 kpu_task_t *task = (kpu_task_t *)_task;
220 int layer_burst_size = 1;
221
222 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
223 .calc_done_int = 1,
224 .layer_cfg_almost_empty_int = 1,
225 .layer_cfg_almost_full_int = 1};
226
227 if(task->remain_layers_length == 0)
228 {
229 return 0;
230 }
231 if(task->remain_layers_length <= layer_burst_size)
232 {
233 for(uint32_t i = 0; i < task->remain_layers_length; i++)
234 {
235 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
236 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
237 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
238 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
239 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
240 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
241 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
242 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
243 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
244 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
245 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
246 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
247 }
248 task->remain_layers_length = 0;
249 } else
250 {
251 for(uint32_t i = 0; i < layer_burst_size; i++)
252 {
253 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
254 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
255 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
256 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
257 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
258 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
259 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
260 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
261 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
262 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
263 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
264 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
265 }
266 task->remain_layers += layer_burst_size;
267 task->remain_layers_length -= layer_burst_size;
268 }
269 return 0;
270}
271
272static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
273{
274 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
275 kpu_dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
276 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
277 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
278 return 0;
279}
280
281static int kpu_run_dma_input_done_push_layers(void *_task)
282{
283 kpu_task_t *task = (kpu_task_t *)_task;
284 kpu->interrupt_clear.reg = 7;
285 dma_end(&g_ai_hdma);
286 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
287 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
288 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
289 .eight_bit_mode = task->eight_bit_mode};
290
291 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
292
293 kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
294
295 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
296 .calc_done_int = 0,
297 .layer_cfg_almost_empty_int = 0,
298 .layer_cfg_almost_full_int = 1};
299 kpu_continue(task);
300 return 0;
301}
302
303static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
304{
305 kpu_task_t *task = _task;
306 kpu_layer_argument_t *first_layer = &task->layers[0];
307 uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
308 kpu_dmac_irq_register(dma_ch, cb, _task, 1);
309 dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
310 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
311}
312
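/*
 * kpu_task_t-based entry point: copies the task into the global context,
 * marks the last layer to send its result out through the data FIFO and to
 * raise an interrupt, registers kpu_continue on the AI interrupt, and starts
 * the input DMA. When the input transfer completes,
 * kpu_run_dma_input_done_push_layers sets up the output DMA and begins
 * feeding layer arguments to the KPU.
 */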
313int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
314{
315 if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
316 return -1;
317
318 memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
319 kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
320
321 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
322
323 uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
324
325 last_layer->dma_parameter.data.send_data_out = 1;
326 last_layer->interrupt_enabe.data.int_en = 1;
327
328 task->dma_ch = dma_ch;
329 task->dst = dest;
330 task->dst_length = output_size;
331 task->callback = callback;
332 task->remain_layers_length = task->layers_length;
333 task->remain_layers = task->layers;
334
335 plic_set_priority(INTNO_AI, 1);
336 plic_irq_register(INTNO_AI, kpu_continue, task);
337 plic_irq_enable(INTNO_AI);
338
339 kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);
340
341 return 0;
342}
343
344uint8_t *kpu_get_output_buf(kpu_task_t *task)
345{
346 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
347 size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
348 return malloc(output_size);
349}
350
351void kpu_release_output_buf(uint8_t *output_buf)
352{
353 if(output_buf != NULL)
354 free(output_buf);
355}
356
357static int kpu_done(void *ctx)
358{
359 atomic_swap(&kpu_status, 0);
360 kpu_task_t *task = (kpu_task_t *)ctx;
361 task->callback(task->ctx);
362 return 0;
363}
364
365static int kpu_config_input(void *ctx)
366{
367 kpu_task_t *task = (kpu_task_t *)ctx;
368 kpu->interrupt_clear.reg = 7;
369 if(task->remain_layers_length <= LAYER_BURST_SIZE)
370 {
371 for(uint32_t i = 0; i < task->remain_layers_length; i++)
372 {
373 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
374 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
375 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
376 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
377 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
378 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
379 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
380 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
381 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
382 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
383 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
384 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
385 }
386 task->remain_layers_length = 0;
387 kpu->interrupt_mask.reg = 7;
388 } else
389 {
390 for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
391 {
392 kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
393 kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
394 kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
395 kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
396 kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
397 kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
398 kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
399 kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
400 kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
401 kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
402 kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
403 kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
404 }
405 task->remain_layers += LAYER_BURST_SIZE;
406 task->remain_layers_length -= LAYER_BURST_SIZE;
407 }
408 return 0;
409}
410
411static void kpu_data_output(kpu_task_t *task)
412{
413 select_dma_channel(task->dma_ch, DMA_SELECT_AI_RX_REQ);
414 kpu_dmac_irq_register(task->dma_ch, kpu_done, task, 1);
415 dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
416 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
417}
418
419static int kpu_data_ready(void *ctx)
420{
421 kpu_task_t *task = (kpu_task_t *)ctx;
422
423 dma_end(&g_ai_hdma);
424 kpu_data_output(task);
425
426 kpu->eight_bit_mode.reg = task->eight_bit_mode;
427 kpu->interrupt_mask.reg = 7;
428 kpu->interrupt_clear.reg = 7;
429 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
430 .fifo_full_threshold = 12, .fifo_empty_threshold = 1};
431
432 plic_set_priority(INTNO_AI, 2);
433 plic_irq_register(INTNO_AI, kpu_config_input, task);
434 plic_irq_enable(INTNO_AI);
435 kpu_config_input(task);
436 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
437 .calc_done_int = 1,
438 .layer_cfg_almost_empty_int = 0,
439 .layer_cfg_almost_full_int = 1};
440 return 0;
441}
442
443static void kpu_data_input(kpu_task_t *task)
444{
445 if(task->src == NULL)
446 {
447 kpu_data_ready(task);
448 return;
449 }
450 kpu_dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
451 kpu_layer_argument_t *layer = &task->layers[0];
452 dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
453 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
454}
455
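/*
 * One-shot task setup: enables the AI peripheral clock, flags the last layer
 * for interrupt + data-out, and derives src_length/dst_length in 64-bit
 * units (hence the divide by 8) before allocating the output buffer.
 */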
456int kpu_single_task_init(kpu_task_t *task)
457{
458 /*
459 * Enable the AI clock
460 */
461 sil_orw_mem((uint32_t *)(TADR_SYSCTL_BASE+TOFF_SYSCTL_CLK_EN_PERI), SYSCTL_CLK_EN_PERI_AI_CLK_EN);
462
463 kpu_layer_argument_t *first_layer = &task->layers[0];
464 kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
465
466 last_layer->dma_parameter.data.send_data_out = 1;
467 last_layer->interrupt_enabe.data.int_en = 1;
468 task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
469 task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
470 task->dst = (uint64_t *)malloc(task->dst_length * 8);
471 if(task->dst == NULL)
472 return 1;
473 memset(task->dst, 0, task->dst_length * 8);
474 return 0;
475}
476
477int kpu_single_task_deinit(kpu_task_t *task)
478{
479 free(task->dst);
480 return 0;
481}
482
483int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
484{
485 uintptr_t base_addr = (uintptr_t)buffer;
486 kpu_model_header_t *header = (kpu_model_header_t *)buffer;
487 kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
488 kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
489
490 if(header->version != 1)
491 return -1;
492 uint32_t layers_length = header->layers_length;
493 task->layers_length = layers_length;
494 task->eight_bit_mode = header->flags & 1;
495 task->layers = layers;
496 task->output_scale = layer_meta[layers_length - 1].output_scale;
497 task->output_bias = layer_meta[layers_length - 1].output_bias;
498 size_t i;
499 for(i = 0; i < layers_length; i++)
500 {
501 layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
502 layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
503 layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
504 }
505
506 if(meta)
507 *meta = layer_meta;
508 return 0;
509}
510
511int kpu_start(kpu_task_t *task)
512{
513 if(atomic_cas(&kpu_status, 0, 1))
514 return -1;
515
516 task->remain_layers_length = task->layers_length;
517 task->remain_layers = task->layers;
518 kpu_data_input(task);
519 return 0;
520}
521
522static void kpu_send_layer(const kpu_layer_argument_t *layer)
523{
524 kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
525 kpu->layer_argument_fifo = layer->image_addr.reg;
526 kpu->layer_argument_fifo = layer->image_channel_num.reg;
527 kpu->layer_argument_fifo = layer->image_size.reg;
528 kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
529 kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
530 kpu->layer_argument_fifo = layer->kernel_offset.reg;
531 kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
532 kpu->layer_argument_fifo = layer->write_back_cfg.reg;
533 kpu->layer_argument_fifo = layer->conv_value.reg;
534 kpu->layer_argument_fifo = layer->conv_value2.reg;
535 kpu->layer_argument_fifo = layer->dma_parameter.reg;
536}
537
538void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
539{
540 kpu->interrupt_clear.reg = 7;
541 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
542 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
543 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
544 .eight_bit_mode = eight_bit_mode};
545 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
546 .calc_done_int = 1,
547 .layer_cfg_almost_empty_int = 0,
548 .layer_cfg_almost_full_int = 1};
549
550 plic_set_priority(INTNO_AI, 1);
551 plic_irq_register(INTNO_AI, callback, userdata);
552 plic_irq_enable(INTNO_AI);
553}
554
555void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
556{
557 uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
558 dmac_set_irq(dma_ch, callback, userdata, 1);
559 dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
560 DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
561}
562
563static void kpu_conv2d_core(kpu_layer_argument_t *layer)
564{
565 kpu_send_layer(layer);
566}
567
568void kpu_conv2d(kpu_layer_argument_t *layer)
569{
570 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
571 .calc_done_int = 1,
572 .layer_cfg_almost_empty_int = 1,
573 .layer_cfg_almost_full_int = 1};
574 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
575 .calc_done_int = 1,
576 .layer_cfg_almost_empty_int = 0,
577 .layer_cfg_almost_full_int = 1};
578 kpu_conv2d_core(layer);
579}
580
581void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
582{
583 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
584 .calc_done_int = 1,
585 .layer_cfg_almost_empty_int = 1,
586 .layer_cfg_almost_full_int = 1};
587 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
588 .calc_done_int = 1,
589 .layer_cfg_almost_empty_int = 1,
590 .layer_cfg_almost_full_int = 1};
591 layer->dma_parameter.data.send_data_out = 1;
592 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
593 dmac_set_irq(dma_ch, callback, userdata, 1);
594 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
595 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
596 kpu_conv2d_core(layer);
597}
598
599void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
600{
601 uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
602 layer->interrupt_enabe.data.full_add = 1;
603
604 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
605 .calc_done_int = 1,
606 .layer_cfg_almost_empty_int = 1,
607 .layer_cfg_almost_full_int = 1};
608 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
609 .calc_done_int = 1,
610 .layer_cfg_almost_empty_int = 1,
611 .layer_cfg_almost_full_int = 1};
612 layer->dma_parameter.data.send_data_out = 1;
613 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
614 dmac_set_irq(dma_ch, callback, userdata, 1);
615 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
616 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
617 kpu_conv2d_core(layer);
618}
619
620void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
621{
622 quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
623
624 size_t i;
625 for(i = 0; i < count; i++)
626 {
627 int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
628 if(value < 0)
629 value = 0;
630 if(value > 0xFF)
631 value = 0xFF;
632 *dest++ = value;
633 }
634}
635
636void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
637{
638 quantize_param_t q1 = *src_param, q2 = *dest_param;
639 size_t oc, y, x;
640
641 if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
642 {
643 uint32_t row_padding = 16;
644 uint32_t row_group = 4;
645 uint32_t row_length = 1;
646 uint32_t height = 4;
647
648 for(oc = 0; oc < channels; oc++)
649 {
650 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
651 for(y = 0; y < 1; y++)
652 {
653 uint8_t *y_origin = channel_origin + y * row_length * 64;
654 for(x = 0; x < 1; x++)
655 {
656 int64_t sum = 0;
657 size_t i;
658 for(i = 0; i < kernel_size; i++)
659 sum += *src++;
660
661 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
662 if(value < 0)
663 value = 0;
664 if(value > 0xFF)
665 value = 0xFF;
666 y_origin[x] = value;
667 }
668 }
669 }
670 } else
671 {
672 for(oc = 0; oc < channels; oc++)
673 {
674 int64_t sum = 0;
675 size_t i;
676 for(i = 0; i < kernel_size; i++)
677 sum += *src++;
678
679 int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
680 if(value < 0)
681 value = 0;
682 if(value > 0xFF)
683 value = 0xFF;
684 dest[oc] = value;
685 }
686 }
687}
688
689void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
690{
691 quantize_param_t q = *src_param;
692 size_t oc;
693
694 for(oc = 0; oc < channels; oc++)
695 {
696 int64_t sum = 0;
697 size_t i;
698 for(i = 0; i < kernel_size; i++)
699 sum += *src++;
700
701 float value = (sum * q.scale + q.bias) / kernel_size;
702 dest[oc] = value;
703 }
704}
705
706void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
707{
708 quantize_param_t q1 = *dest_param;
709 size_t i = 0;
710 for(i = 0; i < channels; i++)
711 *dest++ = src[i * 16] * q1.scale + q1.bias;
712}
713
714void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
715{
716 int ic, oc;
717 for(oc = 0; oc < output_channels; oc++)
718 {
719 const float *c_weights = weights + oc * input_channels;
720
721 float sum = 0.0f;
722 for(ic = 0; ic < input_channels; ic++)
723 sum += src[ic] * c_weights[ic];
724 dest[oc] = sum + biases[oc];
725 }
726}
727
728void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
729{
730 quantize_param_t q1 = *src_param;
731 size_t i = 0;
732 for(i = 0; i < count; i++)
733 *dest++ = src[i] * q1.scale + q1.bias;
734}
735
736void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
737{
738 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
739 size_t oc, y, x;
740
741 uint32_t row_padding;
742 uint32_t row_group;
743 uint32_t row_length;
744
745 if(width <= 16)
746 {
747 row_padding = 16;
748 row_group = 4;
749 row_length = 1;
750 } else if(width <= 32)
751 {
752 row_padding = 32;
753 row_group = 2;
754 row_length = 1;
755 } else
756 {
757 row_padding = 64;
758 row_group = 1;
759 row_length = (width + 63) / 64;
760 }
761
762 for(oc = 0; oc < channels; oc++)
763 {
764 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
765 for(y = 0; y < height; y++)
766 {
767 uint8_t *y_origin = channel_origin + y * row_length * 64;
768 for(x = 0; x < width; x++)
769 y_origin[x] = *src++;
770 }
771 }
772}
773#if USE_CACHED_AI_RAM
774static void kpu_flush_cache(uint32_t addr, size_t lines)
775{
776 size_t line;
777 for(line = 0; line < lines; line++)
778 {
779 const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
780 uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
781 size_t i;
782 for(i = 0; i < 8; i++)
783 dest[i] = src[i];
784 }
785}
786#endif
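/*
 * Right-shift helper used when rescaling quantized sums: the value is first
 * shifted by (shift - 1), and if the remaining low bit is set the final
 * result is adjusted by one away from zero instead of being truncated.
 */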
787static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
788{
789 if(shift > 0)
790 {
791 value >>= shift - 1;
792 if(value & 0x1)
793 {
794 if(value < 0)
795 value = (value >> 1) - 1;
796 else
797 value = (value >> 1) + 1;
798 } else
799 {
800 value >>= 1;
801 }
802 }
803
804 return value;
805}
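/*
 * Copy an image into KPU I/O RAM using the hardware row layout: rows are
 * padded to 64 bytes, narrow images (width <= 16 or <= 32) pack 4 or 2
 * channels per 64-byte row group, and 8-byte-aligned sources with a width
 * that is a multiple of 8 take an unrolled 64-bit copy path.
 */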
806static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
807{
808 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
809 size_t oc, y, x;
810 uint32_t row_padding;
811 uint32_t row_group;
812 uint32_t row_length;
813 if(width <= 16)
814 {
815 row_padding = 16;
816 row_group = 4;
817 row_length = 1;
818 } else if(width <= 32)
819 {
820 row_padding = 32;
821 row_group = 2;
822 row_length = 1;
823 } else
824 {
825 row_padding = 64;
826 row_group = 1;
827 row_length = (width + 63) / 64;
828 }
829
830 if((uintptr_t)src % 8 == 0 && width % 8 == 0)
831 {
832#define UPLOAD_BEGIN() \
833 for(oc = 0; oc < channels; oc++) \
834 { \
835 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
836 for(y = 0; y < height; y++) \
837 { \
838 uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
839
840#define UPLOAD_END() \
841 } \
842 }
843
844 width /= 8;
845 const uint64_t *u64_src = (const uint64_t *)src;
846 if(width == 1)
847 {
848 UPLOAD_BEGIN()
849 y_origin[0] = *u64_src++;
850 UPLOAD_END()
851 } else if(width == 2)
852 {
853 UPLOAD_BEGIN()
854 {
855 y_origin[0] = *u64_src++;
856 y_origin[1] = *u64_src++;
857 }
858 UPLOAD_END()
859 } else if(width == 4)
860 {
861 UPLOAD_BEGIN()
862 {
863 y_origin[0] = *u64_src++;
864 y_origin[1] = *u64_src++;
865 y_origin[2] = *u64_src++;
866 y_origin[3] = *u64_src++;
867 }
868 UPLOAD_END()
869 } else
870 {
871 UPLOAD_BEGIN()
872 for(x = 0; x < width; x++)
873 y_origin[x] = *u64_src++;
874 UPLOAD_END()
875 }
876 } else
877 {
878 for(oc = 0; oc < channels; oc++)
879 {
880 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
881 for(y = 0; y < height; y++)
882 {
883 uint8_t *y_origin = channel_origin + y * row_length * 64;
884 for(x = 0; x < width; x++)
885 y_origin[x] = *src++;
886 }
887 }
888 }
889}
890static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
891{
892 size_t width = layer->image_size.data.i_row_wid + 1;
893 size_t height = layer->image_size.data.i_col_high + 1;
894 size_t channels = layer->image_channel_num.data.i_ch_num + 1;
895
896 kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
897}
898
899static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
900{
901 memcpy(dest, src, count * sizeof(float));
902}
903
904static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
905{
906 size_t i;
907
908 if(act == KLA_RELU)
909 {
910 for(i = 0; i < count; i++)
911 data[i] = max(data[i], 0);
912 } else if(act == KLA_RELU6)
913 {
914 for(i = 0; i < count; i++)
915 data[i] = min(max(data[i], 0), 6);
916 }
917}
918
919static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
920{
921 const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
922 const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
923 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
924 size_t i, count = arg->count;
925
926 for(i = 0; i < count; i++)
927 dest[i] = src_a[i] + src_b[i];
928}
929
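/*
 * Element-wise add of two quantized (uint8) tensors: each input is offset,
 * multiplied and shifted to a common scale, summed, rescaled with
 * kpu_carry_shift, offset for the output and clamped to 0..255. The body is
 * manually unrolled 8 elements per iteration (count is rounded up to a
 * multiple of 8).
 */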
930static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
931{
932 const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
933 const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
934 size_t count = ALIGN_UP(arg->count, 8) / 8;
935 int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
936 int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
937 int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
938
939 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
940 size_t i;
941
942 if(sh_a == sh_b)
943 {
944#define QADD_UNROLL_1(x) \
945 int64_t a##x = *src_a++; \
946 int64_t b##x = *src_b++;
947
948#define QADD_UNROLL_2(x) \
949 a##x += off_a; \
950 b##x += off_b;
951
952#define QADD_UNROLL_3(x) \
953 a##x *= mul_a; \
954 b##x *= mul_b;
955
956#define QADD_UNROLL_4(x) \
957 int64_t v##x = a##x + b##x;
958
959#define QADD_UNROLL_5(x) \
960 v##x >>= sh_a;
961
962#define QADD_UNROLL_6(x) \
963 v##x *= mul_o;
964
965#define QADD_UNROLL_7(x) \
966 v##x = kpu_carry_shift(v##x, sh_o);
967
968#define QADD_UNROLL_8(x) \
969 v##x += off_o;
970
971#define QADD_UNROLL_9(x) \
972 v##x = min(0xFF, max(0, v##x));
973
974#define QADD_UNROLL_10(x) \
975 *dest++ = v##x;
976
977#define QADD_UNROLL_S(x) \
978 QADD_UNROLL_##x(0) \
979 QADD_UNROLL_##x(1) \
980 QADD_UNROLL_##x(2) \
981 QADD_UNROLL_##x(3) \
982 QADD_UNROLL_##x(4) \
983 QADD_UNROLL_##x(5) \
984 QADD_UNROLL_##x(6) \
985 QADD_UNROLL_##x(7)
986
987 for(i = 0; i < count; i++)
988 {
989 QADD_UNROLL_S(1);
990 QADD_UNROLL_S(2);
991 QADD_UNROLL_S(3);
992 QADD_UNROLL_S(4);
993 QADD_UNROLL_S(5);
994 QADD_UNROLL_S(6);
995 QADD_UNROLL_S(7);
996 QADD_UNROLL_S(8);
997 QADD_UNROLL_S(9);
998 QADD_UNROLL_S(10);
999 }
1000 } else
1001 {
1002#undef QADD_UNROLL_1
1003#define QADD_UNROLL_1(x) \
1004 int64_t a##x = *src_a++; \
1005 int64_t b##x = *src_b++;
1006
1007#undef QADD_UNROLL_2
1008#define QADD_UNROLL_2(x) \
1009 a##x += off_a; \
1010 b##x += off_b;
1011
1012#undef QADD_UNROLL_3
1013#define QADD_UNROLL_3(x) \
1014 a##x *= mul_a; \
1015 b##x *= mul_b;
1016
1017#undef QADD_UNROLL_4
1018#define QADD_UNROLL_4(x) \
1019 a##x >>= sh_a; \
1020 b##x >>= sh_b;
1021
1022#undef QADD_UNROLL_5
1023#define QADD_UNROLL_5(x) \
1024 int64_t v##x = a##x + b##x;
1025
1026#undef QADD_UNROLL_6
1027#define QADD_UNROLL_6(x) \
1028 v##x *= mul_o;
1029
1030#undef QADD_UNROLL_7
1031#define QADD_UNROLL_7(x) \
1032 v##x = kpu_carry_shift(v##x, sh_o);
1033
1034#undef QADD_UNROLL_8
1035#define QADD_UNROLL_8(x) \
1036 v##x += off_o;
1037
1038#undef QADD_UNROLL_9
1039#define QADD_UNROLL_9(x) \
1040 v##x = min(0xFF, max(0, v##x));
1041
1042#undef QADD_UNROLL_10
1043#define QADD_UNROLL_10(x) \
1044 *dest++ = v##x;
1045
1046#undef QADD_UNROLL_S
1047#define QADD_UNROLL_S(x) \
1048 QADD_UNROLL_##x(0) \
1049 QADD_UNROLL_##x(1) \
1050 QADD_UNROLL_##x(2) \
1051 QADD_UNROLL_##x(3) \
1052 QADD_UNROLL_##x(4) \
1053 QADD_UNROLL_##x(5) \
1054 QADD_UNROLL_##x(6) \
1055 QADD_UNROLL_##x(7)
1056
1057 for(i = 0; i < count; i++)
1058 {
1059 QADD_UNROLL_S(1);
1060 QADD_UNROLL_S(2);
1061 QADD_UNROLL_S(3);
1062 QADD_UNROLL_S(4);
1063 QADD_UNROLL_S(5);
1064 QADD_UNROLL_S(6);
1065 QADD_UNROLL_S(7);
1066 QADD_UNROLL_S(8);
1067 QADD_UNROLL_S(9);
1068 QADD_UNROLL_S(10);
1069 }
1070 }
1071}
1072
1073static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1074{
1075 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1076 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1077 size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
1078
1079 for(oc = 0; oc < channels; oc++)
1080 {
1081 float sum = 0.f;
1082 size_t i;
1083 for(i = 0; i < kernel_size; i++)
1084 sum += *src++;
1085
1086 dest[oc] = sum / kernel_size;
1087 }
1088}
1089
1090static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1091{
1092 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1093 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1094 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
1095 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
1096 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
1097 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
1098
1099 uint32_t out_y, out_x, oc;
1100
1101 for(oc = 0; oc < out_shape.channels; oc++)
1102 {
1103 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
1104 for(out_y = 0; out_y < out_shape.height; out_y++)
1105 {
1106 for(out_x = 0; out_x < out_shape.width; out_x++)
1107 {
1108 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
1109 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
1110 int32_t kernel_x_start = max(0, -in_x_origin);
1111 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
1112 int32_t kernel_y_start = max(0, -in_y_origin);
1113 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
1114 uint8_t value = 0;
1115
1116 int32_t kernel_y, kernel_x;
1117 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
1118 {
1119 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
1120 {
1121 int32_t in_x = in_x_origin + kernel_x;
1122 int32_t in_y = in_y_origin + kernel_y;
1123 value = max(value, channel_src[in_y * in_shape.width + in_x]);
1124 }
1125 }
1126
1127 *dest++ = value;
1128 }
1129 }
1130 }
1131}
1132
1133static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
1134{
1135 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1136 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1137 kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
1138 uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
1139 uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
1140 uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
1141
1142 uint32_t out_y, out_x, oc;
1143
1144 for(oc = 0; oc < out_shape.channels; oc++)
1145 {
1146 const float *channel_src = src + in_shape.width * in_shape.height * oc;
1147 for(out_y = 0; out_y < out_shape.height; out_y++)
1148 {
1149 for(out_x = 0; out_x < out_shape.width; out_x++)
1150 {
1151 int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
1152 int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
1153 int32_t kernel_x_start = max(0, -in_x_origin);
1154 int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
1155 int32_t kernel_y_start = max(0, -in_y_origin);
1156 int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
1157 float value = 0;
1158 float kernel_count = 0;
1159
1160 int32_t kernel_y, kernel_x;
1161 for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
1162 {
1163 for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
1164 {
1165 int32_t in_x = in_x_origin + kernel_x;
1166 int32_t in_y = in_y_origin + kernel_y;
1167 value += channel_src[in_y * in_shape.width + in_x];
1168 kernel_count++;
1169 }
1170 }
1171
1172 *dest++ = value / kernel_count;
1173 }
1174 }
1175 }
1176}
1177
1178static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1179{
1180 size_t count = arg->count;
1181 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1182
1183 kpu_model_quant_param_t q = arg->quant_param;
1184
1185 float scale = 1.f / q.scale;
1186
1187 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
1188 size_t i;
1189 for(i = 0; i < count; i++)
1190 {
1191 int value = roundf((*src++ - q.bias) * scale);
1192 if(value < 0)
1193 value = 0;
1194 if(value > 0xFF)
1195 value = 0xFF;
1196 *dest++ = (uint8_t)value;
1197 }
1198}
1199
1200static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1201{
1202 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1203 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1204 size_t oc, count = arg->count;
1205 kpu_model_quant_param_t q = arg->quant_param;
1206
1207 for(oc = 0; oc < count; oc++)
1208 dest[oc] = *src++ * q.scale + q.bias;
1209}
1210
1211static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
1212{
1213 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1214 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1215 size_t oc, i, channels = arg->channels, count = arg->channel_size;
1216
1217 for(oc = 0; oc < channels; oc++)
1218 {
1219 const kpu_model_quant_param_t q = arg->quant_params[oc];
1220
1221 for(i = 0; i < count; i++)
1222 *dest++ = *src++ * q.scale + q.bias;
1223 }
1224}
1225
1226static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
1227{
1228 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1229 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1230 size_t oc, count = arg->count;
1231 const uint8_t *table = arg->table;
1232
1233 if(false && count % 8 == 0)
1234 {
1235 for(oc = 0; oc < count;)
1236 {
1237 dest[oc++] = table[*src++];
1238 dest[oc++] = table[*src++];
1239 dest[oc++] = table[*src++];
1240 dest[oc++] = table[*src++];
1241 dest[oc++] = table[*src++];
1242 dest[oc++] = table[*src++];
1243 dest[oc++] = table[*src++];
1244 dest[oc++] = table[*src++];
1245 }
1246 } else
1247 {
1248 for(oc = 0; oc < count; oc++)
1249 dest[oc] = table[src[oc]];
1250 }
1251}
1252
1253static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
1254{
1255 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1256 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1257 size_t oc, channels = arg->channels;
1258
1259 float sum = 0.f;
1260 const float epsilon = 1e-10f;
1261 for(oc = 0; oc < channels; oc++)
1262 sum += src[oc] * src[oc];
1263 if(sum < epsilon)
1264 sum = epsilon;
1265 sum = 1.f / sqrtf(sum);
1266 for(oc = 0; oc < channels; oc++)
1267 dest[oc] = src[oc] * sum;
1268}
1269
1270static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
1271{
1272 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1273 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1274 size_t oc, channels = arg->channels;
1275
1276 float max = -FLT_MAX; /* FLT_MIN is the smallest positive float; start from -FLT_MAX so all-negative inputs are handled */
1277 for(oc = 0; oc < channels; oc++)
1278 max = fmaxf(max, src[oc]);
1279
1280 float sum = 0.f;
1281 for(oc = 0; oc < channels; oc++)
1282 {
1283 float value = expf(src[oc] - max);
1284 sum += value;
1285 dest[oc] = value;
1286 }
1287
1288 for(oc = 0; oc < channels; oc++)
1289 dest[oc] /= sum;
1290}
1291
1292static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
1293{
1294 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1295 uint32_t count = arg->input_count, i;
1296
1297 for(i = 0; i < count; i++)
1298 {
1299 kpu_model_memory_range_t input = arg->inputs_mem[i];
1300 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
1301 memcpy(dest, src, input.size);
1302 dest += input.size;
1303 }
1304}
1305
1306static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
1307{
1308 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1309 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1310 uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
1311 float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
1312 float *bias = (float *)malloc(out_channels * sizeof(float));
1313 memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
1314 memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
1315
1316 if(in_channels % 8 == 0)
1317 {
1318#define FC_UNROLL_1(x) \
1319 float i##x = *c_src++; \
1320 float w##x = *c_weights++;
1321
1322#define FC_UNROLL_2(x) \
1323 sum += i##x * w##x;
1324
1325#define FC_UNROLL_S(x) \
1326 FC_UNROLL_##x(0) \
1327 FC_UNROLL_##x(1) \
1328 FC_UNROLL_##x(2) \
1329 FC_UNROLL_##x(3) \
1330 FC_UNROLL_##x(4) \
1331 FC_UNROLL_##x(5) \
1332 FC_UNROLL_##x(6) \
1333 FC_UNROLL_##x(7)
1334
1335 for(oc = 0; oc < out_channels; oc++)
1336 {
1337 const float *c_src = src;
1338 const float *c_weights = weights + oc * in_channels;
1339
1340 float sum = 0.0f;
1341 for(ic = 0; ic < in_channels / 8; ic++)
1342 {
1343 FC_UNROLL_S(1);
1344 FC_UNROLL_S(2);
1345 }
1346
1347 dest[oc] = sum + bias[oc];
1348 }
1349 } else
1350 {
1351 for(oc = 0; oc < out_channels; oc++)
1352 {
1353 const float *c_weights = weights + oc * in_channels;
1354
1355 float sum = 0.0f;
1356 for(ic = 0; ic < in_channels; ic++)
1357 sum += src[ic] * c_weights[ic];
1358 dest[oc] = sum + bias[oc];
1359 }
1360 }
1361 free(weights);
1362 free(bias);
1363 kpu_float_activation(dest, out_channels, arg->act);
1364}
1365
1366static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
1367{
1368 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1369 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1370 kpu_model_shape_t in_shape = arg->shape;
1371 uint32_t oc, oy, ox;
1372
1373 for(oy = 0; oy < in_shape.height; oy++)
1374 for(ox = 0; ox < in_shape.width; ox++)
1375 for(oc = 0; oc < in_shape.channels; oc++)
1376 *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
1377}
1378
1379static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
1380{
1381 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1382 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1383 kpu_model_shape_t in_shape = arg->in_shape;
1384 uint32_t out_width = arg->out_width, out_height = arg->out_height;
1385 uint32_t oc, oy, ox;
1386
1387 float height_scale = (float)in_shape.height / out_height;
1388 float width_scale = (float)in_shape.width / out_width;
1389
1390 for(oc = 0; oc < in_shape.channels; oc++)
1391 {
1392 const float *channel_src = src + in_shape.width * in_shape.height * oc;
1393 for(oy = 0; oy < out_height; oy++)
1394 {
1395 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
1396 const float *y_origin = channel_src + in_y * in_shape.width;
1397 for(ox = 0; ox < out_width; ox++)
1398 {
1399 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
1400 *dest++ = y_origin[in_x];
1401 }
1402 }
1403 }
1404}
1405
1406static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
1407{
1408 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1409 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1410 kpu_model_shape_t in_shape = arg->in_shape;
1411 uint32_t out_width = arg->out_width, out_height = arg->out_height;
1412 uint32_t oc, oy, ox;
1413
1414 float height_scale = (float)in_shape.height / out_height;
1415 float width_scale = (float)in_shape.width / out_width;
1416
1417 for(oc = 0; oc < in_shape.channels; oc++)
1418 {
1419 const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
1420 for(oy = 0; oy < out_height; oy++)
1421 {
1422 uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
1423 const uint8_t *y_origin = channel_src + in_y * in_shape.width;
1424 for(ox = 0; ox < out_width; ox++)
1425 {
1426 uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
1427 *dest++ = y_origin[in_x];
1428 }
1429 }
1430 }
1431}
1432
1433static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
1434{
1435 const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
1436 float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
1437 size_t oc, channels = arg->channels;
1438
1439 for(oc = 0; oc < channels; oc++)
1440 dest[oc] = 1.f / (1.f + expf(-src[oc]));
1441}
1442
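/*
 * Dispatch a K210 convolution layer to the KPU: the layer descriptor is
 * copied out of the model buffer, its weight/batch-norm/activation pointers
 * are rebased by subtracting IOMEM, and the result either streams to main
 * memory over DMA (KLF_MAIN_MEM_OUT) or stays in KPU RAM with the calc-done
 * interrupt driving the next step.
 */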
1443static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
1444{
1445 volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
1446 layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
1447 layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
1448 layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
1449
1450 if(arg->flags & KLF_MAIN_MEM_OUT)
1451 {
1452 dmac_channel_number_t dma_ch = ctx->dma_ch;
1453 uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
1454 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1455 .calc_done_int = 1,
1456 .layer_cfg_almost_empty_int = 1,
1457 .layer_cfg_almost_full_int = 1};
1458 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1459 .calc_done_int = 1,
1460 .layer_cfg_almost_empty_int = 1,
1461 .layer_cfg_almost_full_int = 1};
1462 layer.dma_parameter.data.send_data_out = 1;
1463 select_dma_channel(dma_ch, DMA_SELECT_AI_RX_REQ);
1464 if(ctx->current_layer < ctx->layers_length)
1465 dmac_set_irq(dma_ch, ai_step, ctx, 1);
1466 else
1467 dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
1468 dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
1469 DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
1470 } else
1471 {
1472 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1473 .calc_done_int = 1,
1474 .layer_cfg_almost_empty_int = 1,
1475 .layer_cfg_almost_full_int = 1};
1476
1477 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1478 .calc_done_int = 0,
1479 .layer_cfg_almost_empty_int = 1,
1480 .layer_cfg_almost_full_int = 1};
1481 layer.interrupt_enabe.data.int_en = 1;
1482 }
1483
1484 kpu_send_layer((const kpu_layer_argument_t *)&layer);
1485}
1486
1487static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1488{
1489 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1490#if USE_CACHED_AI_RAM
1491 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
1492#else
1493 uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
1494#endif
1495
1496 uint32_t row_padding = 16;
1497 uint32_t row_group = 4;
1498 uint32_t row_length = 1;
1499 uint32_t height = 4;
1500 uint32_t oc, x, y, channels = arg->channels;
1501
1502 for(oc = 0; oc < channels; oc++)
1503 {
1504 uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
1505 for(y = 0; y < 1; y++)
1506 {
1507 uint8_t *y_origin = channel_origin + y * row_length * 64;
1508 for(x = 0; x < 1; x++)
1509 y_origin[x] = *src++;
1510 }
1511 }
1512
1513#if USE_CACHED_AI_RAM
1514 uint32_t lines = row_length * height * channels / row_group;
1515 kpu_flush_cache(arg->kpu_mem_out_address, lines);
1516#endif
1517}
1518
1519static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
1520{
1521 const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
1522 uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
1523 uint32_t oc, channels = arg->channels;
1524
1525 for(oc = 0; oc < channels; oc++)
1526 *dest++ = src[oc * 16];
1527}
1528
1529static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
1530{
1531 size_t width = arg->width;
1532 size_t height = arg->height;
1533 size_t channels = arg->channels;
1534
1535 kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
1536}
1537
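/*
 * Parse a kmodel v3 (arch 0) buffer in place: the header is followed by the
 * output descriptors, then the per-layer headers, then the layer bodies.
 * A working buffer of header->main_mem_usage bytes is allocated, and the
 * layer bodies are copied to their non-cached (IOMEM) alias and verified
 * byte by byte.
 */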
1538int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
1539{
1540#if FIX_CACHE
1541 configASSERT(is_memory_cache((uintptr_t)buffer));
1542#endif
1543 uintptr_t base_addr = (uintptr_t)buffer;
1544 const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
1545
1546 if (header->version == 3 && header->arch == 0)
1547 {
1548 ctx->is_nncase = 0;
1549 ctx->model_buffer = buffer;
1550 ctx->output_count = header->output_count;
1551 ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
1552 ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
1553 ctx->layers_length = header->layers_length;
1554 ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
1555 ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
1556 if (!ctx->main_buffer)
1557 return -1;
1558 uint32_t body_size = 0;
1559 for (int i=0; i<ctx->layers_length; i++)
1560 {
1561 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
1562 body_size += cnt_layer_header->body_size;
1563 }
1564 uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
1565 const uint8_t *body_start_cache = ctx->body_start;
1566 memcpy(body_start_iomem, body_start_cache, body_size);
1567 for (int i=0; i<body_size; i++)
1568 {
1569 configASSERT(body_start_iomem[i] == body_start_cache[i]);
1570 }
1571
1572 } else
1573 {
1574 return -1;
1575 }
1576
1577 return 0;
1578}
1579
1580int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
1581{
1582 if(ctx->is_nncase)
1583 return -1;
1584
1585 if(index >= ctx->output_count)
1586 return -1;
1587
1588 const kpu_model_output_t *output = ctx->outputs + index;
1589 *data = ctx->main_buffer + output->address;
1590 *size = output->size;
1591 return 0;
1592}
1593
1594void kpu_model_free(kpu_model_context_t *ctx)
1595{
1596 if(ctx->is_nncase)
1597 return;
1598
1599 free(ctx->main_buffer);
1600 ctx->main_buffer = NULL;
1601}
1602
1603#if KPU_DEBUG
1604static uint64_t last_time;
1605static uint64_t total_time;
1606static uint64_t kpu_time;
1607static uint32_t last_layer_type;
1608
1609static const char *str_layer_type(uint32_t type)
1610{
1611 switch(type)
1612 {
1613 case KL_ADD:
1614 return "Add";
1615 case KL_QUANTIZED_ADD:
1616 return "QuantAdd";
1617 case KL_GLOBAL_AVERAGE_POOL2D:
1618 return "GAP";
1619 case KL_QUANTIZED_MAX_POOL2D:
1620 return "QuantMaxPool2d";
1621 case KL_AVERAGE_POOL2D:
1622 return "AveragePool2d";
1623 case KL_QUANTIZE:
1624 return "Quantize";
1625 case KL_DEQUANTIZE:
1626 return "Dequantize";
1627 case KL_REQUANTIZE:
1628 return "Requantize";
1629 case KL_L2_NORMALIZATION:
1630 return "L2Norm";
1631 case KL_SOFTMAX:
1632 return "Softmax";
1633 case KL_CONCAT:
1634 return "Concat";
1635 case KL_QUANTIZED_CONCAT:
1636 return "QuantConcat";
1637 case KL_FULLY_CONNECTED:
1638 return "FullyConnected";
1639 case KL_TENSORFLOW_FLATTEN:
1640 return "TFFlatten";
1641 case KL_RESIZE_NEAREST_NEIGHBOR:
1642 return "ResizeNearestNeighbor";
1643 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1644 return "QuantResizeNearestNeighbor";
1645 case KL_CHANNELWISE_DEQUANTIZE:
1646 return "ChannelwiseDequantize";
1647 case KL_LOGISTIC:
1648 return "Logistic";
1649 case KL_K210_CONV:
1650 return "K210Conv";
1651 case KL_K210_ADD_PADDING:
1652 return "K210AddPad";
1653 case KL_K210_REMOVE_PADDING:
1654 return "K210RemovePad";
1655 case KL_K210_UPLOAD:
1656 return "K210Upload";
1657 default:
1658 return "Unknown";
1659 }
1660}
1661#endif
1662
1663static int kpu_kmodel_done(kpu_model_context_t *ctx)
1664{
1665 kpu->interrupt_clear.data = (kpu_config_interrupt_t){
1666 .calc_done_int = 1,
1667 .layer_cfg_almost_empty_int = 1,
1668 .layer_cfg_almost_full_int = 1};
1669 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1670 .calc_done_int = 1,
1671 .layer_cfg_almost_empty_int = 1,
1672 .layer_cfg_almost_full_int = 1};
1673#if KPU_DEBUG
1674 uint32_t cnt_layer_id = ctx->current_layer - 1;
1675 uint64_t time = sysctl_get_time_us();
1676 if(last_time != 0)
1677 {
1678 uint64_t layer_time = time - last_time;
1679 syslog(LOG_NOTICE, "layer %d [%s]: %f ms", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
1680 total_time += layer_time;
1681 if(last_layer_type == KL_K210_CONV)
1682 kpu_time += layer_time;
1683 }
1684
1685 syslog(LOG_NOTICE, "KPU: %f ms", kpu_time / 1000.0);
1686 syslog(LOG_NOTICE, "CPU: %f ms", (total_time - kpu_time) / 1000.0);
1687 syslog(LOG_NOTICE, "Model: %f ms", total_time / 1000.0);
1688#endif
1689 ctx->done_callback(ctx->userdata);
1690 return 0;
1691}
1692
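/*
 * Execute one model layer per call: software layers (add, pooling, quantize,
 * softmax, ...) run on the CPU and immediately proceed to the next layer,
 * while a KL_K210_CONV layer returns here because the KPU interrupt or the
 * output DMA completion re-enters ai_step. kpu_kmodel_done fires after the
 * last layer.
 */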
1693static int ai_step(void *userdata)
1694{
1695 kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;
1696
1697 uint32_t cnt_layer_id = ctx->current_layer;
1698 const uint8_t *layer_body = ctx->current_body;
1699 const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
1700 if (cnt_layer_id >= ctx->layers_length) {
1701 //syslog(LOG_NOTICE, "overrun");
1702 kpu_kmodel_done(ctx);
1703 return -1;
1704 }
1705
1706 ctx->current_layer++;
1707 ctx->current_body += cnt_layer_header->body_size;
1708
1709#if KPU_DEBUG
1710 uint64_t time = sysctl_get_time_us();
1711 if(last_time != 0)
1712 {
1713 uint64_t layer_time = time - last_time;
1714 syslog(LOG_NOTICE, "layer %d/%d [%s]: %d.%03d ms", cnt_layer_id, ctx->layers_length, str_layer_type(last_layer_type), layer_time / 1000, layer_time % 1000);
1715 total_time += layer_time;
1716 if(last_layer_type == KL_K210_CONV)
1717 kpu_time += layer_time;
1718 }
1719
1720 last_layer_type = cnt_layer_header->type;
1721 last_time = sysctl_get_time_us();
1722#endif
1723
1724 switch(cnt_layer_header->type)
1725 {
1726 case KL_ADD:
1727 kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
1728 break;
1729 case KL_QUANTIZED_ADD:
1730 kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
1731 break;
1732 case KL_GLOBAL_AVERAGE_POOL2D:
1733 kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
1734 break;
1735 case KL_QUANTIZED_MAX_POOL2D:
1736 kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
1737 break;
1738 case KL_AVERAGE_POOL2D:
1739 kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
1740 break;
1741 case KL_QUANTIZE:
1742 kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
1743 break;
1744 case KL_DEQUANTIZE:
1745 kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
1746 break;
1747 case KL_REQUANTIZE:
1748 kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
1749 break;
1750 case KL_L2_NORMALIZATION:
1751 kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
1752 break;
1753 case KL_SOFTMAX:
1754 kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
1755 break;
1756 case KL_CONCAT:
1757 case KL_QUANTIZED_CONCAT:
1758 kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
1759 break;
1760 case KL_FULLY_CONNECTED:
1761 kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
1762 break;
1763 case KL_TENSORFLOW_FLATTEN:
1764 kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
1765 break;
1766 case KL_RESIZE_NEAREST_NEIGHBOR:
1767 kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1768 break;
1769 case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
1770 kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
1771 break;
1772 case KL_CHANNELWISE_DEQUANTIZE:
1773 kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
1774 break;
1775 case KL_LOGISTIC:
1776 kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
1777 break;
1778 case KL_K210_CONV:
1779 kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
1780 return 0;
1781 case KL_K210_ADD_PADDING:
1782 kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
1783 break;
1784 case KL_K210_REMOVE_PADDING:
1785 kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
1786 break;
1787 case KL_K210_UPLOAD:
1788 kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
1789 break;
1790 default:
1791 assert(!"Layer is not supported.");
1792 kpu_kmodel_done(ctx);
1793 return -1;
1794 }
1795
1796 if (ctx->current_layer < ctx->layers_length)
1797 ai_step(userdata);
1798 else
1799 kpu_kmodel_done(ctx);
1800 return 0;
1801}
1802
1803static void ai_step_not_isr(void *userdata)
1804{
1805 sysctl_disable_irq();
1806 ai_step(userdata);
1807 sysctl_enable_irq();
1808}
1809
1810int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
1811{
1812 if(ctx->is_nncase)
1813 return -1;
1814
1815 ctx->dma_ch = dma_ch;
1816 ctx->done_callback = done_callback;
1817 ctx->userdata = userdata;
1818 ctx->current_layer = 0;
1819 ctx->current_body = ctx->body_start;
1820#if KPU_DEBUG
1821 last_time = 0;
1822 total_time = 0;
1823 kpu_time = 0;
1824#endif
1825
1826 kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
1827 kpu->interrupt_clear.reg = 7;
1828 kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
1829 .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
1830 kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
1831 .eight_bit_mode = header->flags & 1};
1832 kpu->interrupt_mask.data = (kpu_config_interrupt_t){
1833 .calc_done_int = 1,
1834 .layer_cfg_almost_empty_int = 0,
1835 .layer_cfg_almost_full_int = 1};
1836
1837 plic_set_priority(INTNO_AI, 1);
1838 plic_irq_register(INTNO_AI, ai_step, ctx);
1839 plic_irq_enable(INTNO_AI);
1840
1841 const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
1842
1843 switch(first_layer_header->type)
1844 {
1845 case KL_K210_CONV:
1846 {
1847 const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
1848 kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);
1849
1850 if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
1851 {
1852 kpu_kmodel_input_with_padding(&layer_arg, src);
1853 ai_step_not_isr(ctx);
1854 } else
1855 {
1856 kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
1857 }
1858 }
1859 break;
1860 case KL_FULLY_CONNECTED:
1861 {
1862 const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
1863 kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
1864 ai_step_not_isr(ctx);
1865 }
1866 break;
1867 default:
1868 return -1;
1869 }
1870
1871 return 0;
1872}
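/*
 * Minimal usage sketch: load a kmodel, run one inference and read output 0.
 * The helper names (kpu_example_run, kpu_example_done, g_ai_done), the
 * busy-wait and the choice of AI_DMA_CH are illustrative assumptions, and
 * the callback signature assumes kpu_done_callback_t takes only a userdata
 * pointer, as it is invoked above. Guarded with #if 0 so it does not affect
 * the build.
 */
#if 0
static volatile int g_ai_done;

static void kpu_example_done(void *userdata)
{
    (void)userdata;
    g_ai_done = 1;
}

static void kpu_example_run(const uint8_t *model_data, const uint8_t *input_image)
{
    static kpu_model_context_t task;

    if (kpu_load_kmodel(&task, model_data) != 0)
        return; /* unsupported kmodel version or out of memory */

    g_ai_done = 0;
    /* AI_DMA_CH is the channel this port's DMA helpers expect. */
    if (kpu_run_kmodel(&task, input_image, AI_DMA_CH, kpu_example_done, NULL) != 0)
        return;

    while (!g_ai_done)
        ; /* a real application would block on a task event instead */

    uint8_t *output;
    size_t output_size;
    if (kpu_get_output(&task, 0, &output, &output_size) == 0)
    {
        /* consume output[0..output_size-1] here */
    }

    kpu_model_free(&task);
}
#endif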